mirror of
https://github.com/osm-search/Nominatim.git
synced 2026-02-26 11:08:13 +00:00
add wrapper class for place data passed to tokenizer
This is mostly for convenience and documentation purposes.
This commit is contained in:
44
nominatim/indexer/place_info.py
Normal file
44
nominatim/indexer/place_info.py
Normal file
@@ -0,0 +1,44 @@
|
|||||||
|
"""
|
||||||
|
Wrapper around place information the indexer gets from the database and hands to
|
||||||
|
the tokenizer.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import psycopg2.extras
|
||||||
|
|
||||||
|
class PlaceInfo:
    """ Data class containing all information the tokenizer gets about a
        place it should process the names for.
    """

    def __init__(self, info):
        # Raw place dictionary as handed over by the indexer.
        self._raw = info


    def analyze(self, analyzer):
        """ Process this place with the given tokenizer and return the
            result in psycopg2-compatible Json.
        """
        token_info = analyzer.process_place(self)
        return psycopg2.extras.Json(token_info)


    @property
    def name(self):
        """ A dictionary with the names of the place or None if the place
            has no names.
        """
        return self._raw.get('name')


    @property
    def address(self):
        """ A dictionary with the address elements of the place
            or None if no address information is available.
        """
        return self._raw.get('address')


    @property
    def country_feature(self):
        """ Return the country code if the place is a valid country boundary.
        """
        return self._raw.get('country_feature')
|
||||||
@@ -4,14 +4,16 @@ tasks.
|
|||||||
"""
|
"""
|
||||||
import functools
|
import functools
|
||||||
|
|
||||||
import psycopg2.extras
|
|
||||||
from psycopg2 import sql as pysql
|
from psycopg2 import sql as pysql
|
||||||
|
|
||||||
|
from nominatim.indexer.place_info import PlaceInfo
|
||||||
|
|
||||||
# pylint: disable=C0111
|
# pylint: disable=C0111
|
||||||
|
|
||||||
def _mk_valuelist(template, num):
|
def _mk_valuelist(template, num):
|
||||||
return pysql.SQL(',').join([pysql.SQL(template)] * num)
|
return pysql.SQL(',').join([pysql.SQL(template)] * num)
|
||||||
|
|
||||||
|
|
||||||
class AbstractPlacexRunner:
|
class AbstractPlacexRunner:
|
||||||
""" Returns SQL commands for indexing of the placex table.
|
""" Returns SQL commands for indexing of the placex table.
|
||||||
"""
|
"""
|
||||||
@@ -47,7 +49,7 @@ class AbstractPlacexRunner:
|
|||||||
for place in places:
|
for place in places:
|
||||||
for field in ('place_id', 'name', 'address', 'linked_place_id'):
|
for field in ('place_id', 'name', 'address', 'linked_place_id'):
|
||||||
values.append(place[field])
|
values.append(place[field])
|
||||||
values.append(psycopg2.extras.Json(self.analyzer.process_place(place)))
|
values.append(PlaceInfo(place).analyze(self.analyzer))
|
||||||
|
|
||||||
worker.perform(self._index_sql(len(places)), values)
|
worker.perform(self._index_sql(len(places)), values)
|
||||||
|
|
||||||
@@ -141,7 +143,7 @@ class InterpolationRunner:
|
|||||||
values = []
|
values = []
|
||||||
for place in places:
|
for place in places:
|
||||||
values.extend((place[x] for x in ('place_id', 'address')))
|
values.extend((place[x] for x in ('place_id', 'address')))
|
||||||
values.append(psycopg2.extras.Json(self.analyzer.process_place(place)))
|
values.append(PlaceInfo(place).analyze(self.analyzer))
|
||||||
|
|
||||||
worker.perform(self._index_sql(len(places)), values)
|
worker.perform(self._index_sql(len(places)), values)
|
||||||
|
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ from abc import ABC, abstractmethod
|
|||||||
from typing import List, Tuple, Dict, Any
|
from typing import List, Tuple, Dict, Any
|
||||||
|
|
||||||
from nominatim.config import Configuration
|
from nominatim.config import Configuration
|
||||||
|
from nominatim.indexer.place_info import PlaceInfo
|
||||||
|
|
||||||
# pylint: disable=unnecessary-pass
|
# pylint: disable=unnecessary-pass
|
||||||
|
|
||||||
@@ -105,20 +106,13 @@ class AbstractAnalyzer(ABC):
|
|||||||
|
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def process_place(self, place: Dict) -> Any:
|
def process_place(self, place: PlaceInfo) -> Any:
|
||||||
""" Extract tokens for the given place and compute the
|
""" Extract tokens for the given place and compute the
|
||||||
information to be handed to the PL/pgSQL processor for building
|
information to be handed to the PL/pgSQL processor for building
|
||||||
the search index.
|
the search index.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
place: Dictionary with the information about the place. Currently
|
place: Place information retrieved from the database.
|
||||||
the following fields may be present:
|
|
||||||
|
|
||||||
- *name* is a dictionary of names for the place together
|
|
||||||
with the designation of the name.
|
|
||||||
- *address* is a dictionary of address terms.
|
|
||||||
- *country_feature* is set to a country code when the
|
|
||||||
place describes a country.
|
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
A JSON-serialisable structure that will be handed into
|
A JSON-serialisable structure that will be handed into
|
||||||
|
|||||||
@@ -390,18 +390,18 @@ class LegacyICUNameAnalyzer(AbstractAnalyzer):
|
|||||||
"""
|
"""
|
||||||
token_info = _TokenInfo(self._cache)
|
token_info = _TokenInfo(self._cache)
|
||||||
|
|
||||||
names = place.get('name')
|
names = place.name
|
||||||
|
|
||||||
if names:
|
if names:
|
||||||
fulls, partials = self._compute_name_tokens(names)
|
fulls, partials = self._compute_name_tokens(names)
|
||||||
|
|
||||||
token_info.add_names(fulls, partials)
|
token_info.add_names(fulls, partials)
|
||||||
|
|
||||||
country_feature = place.get('country_feature')
|
country_feature = place.country_feature
|
||||||
if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
|
if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
|
||||||
self.add_country_names(country_feature.lower(), names)
|
self.add_country_names(country_feature.lower(), names)
|
||||||
|
|
||||||
address = place.get('address')
|
address = place.address
|
||||||
if address:
|
if address:
|
||||||
self._process_place_address(token_info, address)
|
self._process_place_address(token_info, address)
|
||||||
|
|
||||||
|
|||||||
@@ -405,16 +405,16 @@ class LegacyNameAnalyzer(AbstractAnalyzer):
|
|||||||
"""
|
"""
|
||||||
token_info = _TokenInfo(self._cache)
|
token_info = _TokenInfo(self._cache)
|
||||||
|
|
||||||
names = place.get('name')
|
names = place.name
|
||||||
|
|
||||||
if names:
|
if names:
|
||||||
token_info.add_names(self.conn, names)
|
token_info.add_names(self.conn, names)
|
||||||
|
|
||||||
country_feature = place.get('country_feature')
|
country_feature = place.country_feature
|
||||||
if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
|
if country_feature and re.fullmatch(r'[A-Za-z][A-Za-z]', country_feature):
|
||||||
self.add_country_names(country_feature.lower(), names)
|
self.add_country_names(country_feature.lower(), names)
|
||||||
|
|
||||||
address = place.get('address')
|
address = place.address
|
||||||
if address:
|
if address:
|
||||||
self._process_place_address(token_info, address)
|
self._process_place_address(token_info, address)
|
||||||
|
|
||||||
|
|||||||
@@ -7,12 +7,11 @@ import logging
|
|||||||
import os
|
import os
|
||||||
import tarfile
|
import tarfile
|
||||||
|
|
||||||
import psycopg2.extras
|
|
||||||
|
|
||||||
from nominatim.db.connection import connect
|
from nominatim.db.connection import connect
|
||||||
from nominatim.db.async_connection import WorkerPool
|
from nominatim.db.async_connection import WorkerPool
|
||||||
from nominatim.db.sql_preprocessor import SQLPreprocessor
|
from nominatim.db.sql_preprocessor import SQLPreprocessor
|
||||||
from nominatim.errors import UsageError
|
from nominatim.errors import UsageError
|
||||||
|
from nominatim.indexer.place_info import PlaceInfo
|
||||||
|
|
||||||
LOG = logging.getLogger()
|
LOG = logging.getLogger()
|
||||||
|
|
||||||
@@ -58,7 +57,7 @@ def handle_threaded_sql_statements(pool, fd, analyzer):
|
|||||||
address = dict(street=row['street'], postcode=row['postcode'])
|
address = dict(street=row['street'], postcode=row['postcode'])
|
||||||
args = ('SRID=4326;' + row['geometry'],
|
args = ('SRID=4326;' + row['geometry'],
|
||||||
int(row['from']), int(row['to']), row['interpolation'],
|
int(row['from']), int(row['to']), row['interpolation'],
|
||||||
psycopg2.extras.Json(analyzer.process_place(dict(address=address))),
|
PlaceInfo({'address': address}).analyze(analyzer),
|
||||||
analyzer.normalize_postcode(row['postcode']))
|
analyzer.normalize_postcode(row['postcode']))
|
||||||
except ValueError:
|
except ValueError:
|
||||||
continue
|
continue
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
"""
|
"""
|
||||||
Tokenizer for testing.
|
Tokenizer for testing.
|
||||||
"""
|
"""
|
||||||
|
from nominatim.indexer.place_info import PlaceInfo
|
||||||
|
|
||||||
def create(dsn, data_dir):
|
def create(dsn, data_dir):
|
||||||
""" Create a new instance of the tokenizer provided by this module.
|
""" Create a new instance of the tokenizer provided by this module.
|
||||||
@@ -68,4 +69,5 @@ class DummyNameAnalyzer:
|
|||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def process_place(place):
|
def process_place(place):
|
||||||
|
assert isinstance(place, PlaceInfo)
|
||||||
return {}
|
return {}
|
||||||
|
|||||||
@@ -11,6 +11,7 @@ from nominatim.tokenizer.icu_name_processor import ICUNameProcessorRules
|
|||||||
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
|
from nominatim.tokenizer.icu_rule_loader import ICURuleLoader
|
||||||
from nominatim.db import properties
|
from nominatim.db import properties
|
||||||
from nominatim.db.sql_preprocessor import SQLPreprocessor
|
from nominatim.db.sql_preprocessor import SQLPreprocessor
|
||||||
|
from nominatim.indexer.place_info import PlaceInfo
|
||||||
|
|
||||||
from mock_icu_word_table import MockIcuWordTable
|
from mock_icu_word_table import MockIcuWordTable
|
||||||
|
|
||||||
@@ -322,30 +323,37 @@ class TestPlaceNames:
|
|||||||
assert eval(info['names']) == set((t[2] for t in tokens))
|
assert eval(info['names']) == set((t[2] for t in tokens))
|
||||||
|
|
||||||
|
|
||||||
|
def process_named_place(self, names, country_feature=None):
|
||||||
|
place = {'name': names}
|
||||||
|
if country_feature:
|
||||||
|
place['country_feature'] = country_feature
|
||||||
|
|
||||||
|
return self.analyzer.process_place(PlaceInfo(place))
|
||||||
|
|
||||||
|
|
||||||
def test_simple_names(self):
|
def test_simple_names(self):
|
||||||
info = self.analyzer.process_place({'name': {'name': 'Soft bAr', 'ref': '34'}})
|
info = self.process_named_place({'name': 'Soft bAr', 'ref': '34'})
|
||||||
|
|
||||||
self.expect_name_terms(info, '#Soft bAr', '#34', 'Soft', 'bAr', '34')
|
self.expect_name_terms(info, '#Soft bAr', '#34', 'Soft', 'bAr', '34')
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('sep', [',' , ';'])
|
@pytest.mark.parametrize('sep', [',' , ';'])
|
||||||
def test_names_with_separator(self, sep):
|
def test_names_with_separator(self, sep):
|
||||||
info = self.analyzer.process_place({'name': {'name': sep.join(('New York', 'Big Apple'))}})
|
info = self.process_named_place({'name': sep.join(('New York', 'Big Apple'))})
|
||||||
|
|
||||||
self.expect_name_terms(info, '#New York', '#Big Apple',
|
self.expect_name_terms(info, '#New York', '#Big Apple',
|
||||||
'new', 'york', 'big', 'apple')
|
'new', 'york', 'big', 'apple')
|
||||||
|
|
||||||
|
|
||||||
def test_full_names_with_bracket(self):
|
def test_full_names_with_bracket(self):
|
||||||
info = self.analyzer.process_place({'name': {'name': 'Houseboat (left)'}})
|
info = self.process_named_place({'name': 'Houseboat (left)'})
|
||||||
|
|
||||||
self.expect_name_terms(info, '#Houseboat (left)', '#Houseboat',
|
self.expect_name_terms(info, '#Houseboat (left)', '#Houseboat',
|
||||||
'houseboat', 'left')
|
'houseboat', 'left')
|
||||||
|
|
||||||
|
|
||||||
def test_country_name(self, word_table):
|
def test_country_name(self, word_table):
|
||||||
info = self.analyzer.process_place({'name': {'name': 'Norge'},
|
info = self.process_named_place({'name': 'Norge'}, country_feature='no')
|
||||||
'country_feature': 'no'})
|
|
||||||
|
|
||||||
self.expect_name_terms(info, '#norge', 'norge')
|
self.expect_name_terms(info, '#norge', 'norge')
|
||||||
assert word_table.get_country() == {('no', 'NORGE')}
|
assert word_table.get_country() == {('no', 'NORGE')}
|
||||||
@@ -361,7 +369,7 @@ class TestPlaceAddress:
|
|||||||
|
|
||||||
|
|
||||||
def process_address(self, **kwargs):
|
def process_address(self, **kwargs):
|
||||||
return self.analyzer.process_place({'address': kwargs})
|
return self.analyzer.process_place(PlaceInfo({'address': kwargs}))
|
||||||
|
|
||||||
|
|
||||||
def name_token_set(self, *expected_terms):
|
def name_token_set(self, *expected_terms):
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ import shutil
|
|||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
from nominatim.indexer.place_info import PlaceInfo
|
||||||
from nominatim.tokenizer import legacy_tokenizer
|
from nominatim.tokenizer import legacy_tokenizer
|
||||||
from nominatim.db import properties
|
from nominatim.db import properties
|
||||||
from nominatim.errors import UsageError
|
from nominatim.errors import UsageError
|
||||||
@@ -284,21 +285,21 @@ def test_add_more_country_names(analyzer, word_table, make_standard_name):
|
|||||||
|
|
||||||
|
|
||||||
def test_process_place_names(analyzer, make_keywords):
|
def test_process_place_names(analyzer, make_keywords):
|
||||||
info = analyzer.process_place({'name' : {'name' : 'Soft bAr', 'ref': '34'}})
|
info = analyzer.process_place(PlaceInfo({'name' : {'name' : 'Soft bAr', 'ref': '34'}}))
|
||||||
|
|
||||||
assert info['names'] == '{1,2,3}'
|
assert info['names'] == '{1,2,3}'
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('pcode', ['12345', 'AB 123', '34-345'])
|
@pytest.mark.parametrize('pcode', ['12345', 'AB 123', '34-345'])
|
||||||
def test_process_place_postcode(analyzer, create_postcode_id, word_table, pcode):
|
def test_process_place_postcode(analyzer, create_postcode_id, word_table, pcode):
|
||||||
analyzer.process_place({'address': {'postcode' : pcode}})
|
analyzer.process_place(PlaceInfo({'address': {'postcode' : pcode}}))
|
||||||
|
|
||||||
assert word_table.get_postcodes() == {pcode, }
|
assert word_table.get_postcodes() == {pcode, }
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize('pcode', ['12:23', 'ab;cd;f', '123;836'])
|
@pytest.mark.parametrize('pcode', ['12:23', 'ab;cd;f', '123;836'])
|
||||||
def test_process_place_bad_postcode(analyzer, create_postcode_id, word_table, pcode):
|
def test_process_place_bad_postcode(analyzer, create_postcode_id, word_table, pcode):
|
||||||
analyzer.process_place({'address': {'postcode' : pcode}})
|
analyzer.process_place(PlaceInfo({'address': {'postcode' : pcode}}))
|
||||||
|
|
||||||
assert not word_table.get_postcodes()
|
assert not word_table.get_postcodes()
|
||||||
|
|
||||||
@@ -319,7 +320,7 @@ class TestHousenumberName:
|
|||||||
@staticmethod
|
@staticmethod
|
||||||
@pytest.mark.parametrize('hnr', ['123a', '1', '101'])
|
@pytest.mark.parametrize('hnr', ['123a', '1', '101'])
|
||||||
def test_process_place_housenumbers_simple(analyzer, hnr):
|
def test_process_place_housenumbers_simple(analyzer, hnr):
|
||||||
info = analyzer.process_place({'address': {'housenumber' : hnr}})
|
info = analyzer.process_place(PlaceInfo({'address': {'housenumber' : hnr}}))
|
||||||
|
|
||||||
assert info['hnr'] == hnr
|
assert info['hnr'] == hnr
|
||||||
assert info['hnr_tokens'].startswith("{")
|
assert info['hnr_tokens'].startswith("{")
|
||||||
@@ -327,15 +328,15 @@ class TestHousenumberName:
|
|||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def test_process_place_housenumbers_lists(analyzer):
|
def test_process_place_housenumbers_lists(analyzer):
|
||||||
info = analyzer.process_place({'address': {'conscriptionnumber' : '1; 2;3'}})
|
info = analyzer.process_place(PlaceInfo({'address': {'conscriptionnumber' : '1; 2;3'}}))
|
||||||
|
|
||||||
assert set(info['hnr'].split(';')) == set(('1', '2', '3'))
|
assert set(info['hnr'].split(';')) == set(('1', '2', '3'))
|
||||||
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def test_process_place_housenumbers_duplicates(analyzer):
|
def test_process_place_housenumbers_duplicates(analyzer):
|
||||||
info = analyzer.process_place({'address': {'housenumber' : '134',
|
info = analyzer.process_place(PlaceInfo({'address': {'housenumber' : '134',
|
||||||
'conscriptionnumber' : '134',
|
'conscriptionnumber' : '134',
|
||||||
'streetnumber' : '99a'}})
|
'streetnumber' : '99a'}}))
|
||||||
|
|
||||||
assert set(info['hnr'].split(';')) == set(('134', '99a'))
|
assert set(info['hnr'].split(';')) == set(('134', '99a'))
|
||||||
|
|||||||
Reference in New Issue
Block a user