Merge pull request #2731 from lonvia/cleanup-special-phrases

Minor code reorganisation around special phrase parsing
This commit is contained in:
Sarah Hoffmann
2022-05-31 17:13:56 +02:00
committed by GitHub
16 changed files with 145 additions and 157 deletions

View File

@@ -128,7 +128,7 @@ class SetupAll:
drop=args.no_updates)
LOG.warning('Create search index for default country names.')
country_info.create_country_names(conn, tokenizer,
args.config.LANGUAGES)
args.config.get_str_list('LANGUAGES'))
if args.no_updates:
freeze.drop_update_tables(conn)
tokenizer.finalize_import(args.config)

View File

@@ -99,6 +99,17 @@ class Configuration:
raise UsageError("Configuration error.") from exp
def get_str_list(self, name):
    """ Return the configuration parameter `name` as a list of strings.

        The raw value is interpreted as a comma-separated list and every
        entry is stripped of surrounding whitespace. When the parameter
        is unset or empty, None is returned instead of a list.
    """
    value = self.__getattr__(name)
    if not value:
        return None
    return [entry.strip() for entry in value.split(',')]
def get_path(self, name):
""" Return the given configuration parameter as a Path.
If a relative path is configured, then the function converts this

View File

@@ -131,9 +131,6 @@ def create_country_names(conn, tokenizer, languages=None):
empty then only name translations for the given languages are added
to the index.
"""
if languages:
languages = languages.split(',')
def _include_key(key):
return ':' not in key or not languages or \
key[key.index(':') + 1:] in languages

View File

@@ -11,43 +11,31 @@
"""
import csv
import os
from collections.abc import Iterator
from nominatim.tools.special_phrases.special_phrase import SpecialPhrase
from nominatim.errors import UsageError
class SPCsvLoader(Iterator):
class SPCsvLoader:
"""
Handles loading of special phrases from external csv file.
"""
def __init__(self, csv_path):
super().__init__()
self.csv_path = csv_path
self.has_been_read = False
def __next__(self):
if self.has_been_read:
raise StopIteration()
self.has_been_read = True
self.check_csv_validity()
return self.parse_csv()
def parse_csv(self):
"""
Open and parse the given csv file.
def generate_phrases(self):
""" Open and parse the given csv file.
Create the corresponding SpecialPhrases.
"""
phrases = set()
self._check_csv_validity()
with open(self.csv_path, encoding='utf-8') as fd:
reader = csv.DictReader(fd, delimiter=',')
for row in reader:
phrases.add(
SpecialPhrase(row['phrase'], row['class'], row['type'], row['operator'])
)
return phrases
yield SpecialPhrase(row['phrase'], row['class'], row['type'], row['operator'])
def check_csv_validity(self):
def _check_csv_validity(self):
"""
Check that the csv file has the right extension.
"""

View File

@@ -62,11 +62,10 @@ class SPImporter():
# Store pairs of class/type for further processing
class_type_pairs = set()
for loaded_phrases in self.sp_loader:
for phrase in loaded_phrases:
result = self._process_phrase(phrase)
if result:
class_type_pairs.add(result)
for phrase in self.sp_loader.generate_phrases():
result = self._process_phrase(phrase)
if result:
class_type_pairs.add(result)
self._create_place_classtype_table_and_indexes(class_type_pairs)
if should_replace:

View File

@@ -9,46 +9,56 @@
"""
import re
import logging
from collections.abc import Iterator
from nominatim.tools.special_phrases.special_phrase import SpecialPhrase
from nominatim.tools.exec_utils import get_url
LOG = logging.getLogger()
class SPWikiLoader(Iterator):
def _get_wiki_content(lang):
    """ Download and return the content of the wiki page that holds
        the special phrases for the language `lang`.

        The page is fetched from the OSM wiki, e.g. for English:
        https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/EN
    """
    base = 'https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/'
    return get_url(base + lang.upper())
class SPWikiLoader:
"""
Handles loading of special phrases from the wiki.
"""
def __init__(self, config, languages=None):
def __init__(self, config):
super().__init__()
self.config = config
# Compile the regex here to increase performance.
self.occurence_pattern = re.compile(
r'\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([^\|]+) *\|\| *([\-YN])'
)
self.languages = self._load_languages() if not languages else list(languages)
# Hack around a bug where building=yes was imported with quotes into the wiki
self.type_fix_pattern = re.compile(r'\"|"')
self._load_languages()
def __next__(self):
if not self.languages:
raise StopIteration
lang = self.languages.pop(0)
loaded_xml = self._get_wiki_content(lang)
LOG.warning('Importing phrases for lang: %s...', lang)
return self.parse_xml(loaded_xml)
def parse_xml(self, xml):
def generate_phrases(self):
""" Download the wiki pages for the configured languages
and extract the phrases from the page.
"""
Parses XML content and extracts special phrases from it.
Return a list of SpecialPhrase.
"""
# One match will be of format [label, class, type, operator, plural]
matches = self.occurence_pattern.findall(xml)
returned_phrases = set()
for match in matches:
returned_phrases.add(
SpecialPhrase(match[0], match[1], match[2], match[3])
)
return returned_phrases
for lang in self.languages:
LOG.warning('Importing phrases for lang: %s...', lang)
loaded_xml = _get_wiki_content(lang)
# One match will be of format [label, class, type, operator, plural]
matches = self.occurence_pattern.findall(loaded_xml)
for match in matches:
yield SpecialPhrase(match[0],
match[1],
self.type_fix_pattern.sub('', match[2]),
match[3])
def _load_languages(self):
"""
@@ -56,21 +66,11 @@ class SPWikiLoader(Iterator):
or default if there is no languages configured.
The system will extract special phrases only from all specified languages.
"""
default_languages = [
if self.config.LANGUAGES:
self.languages = self.config.get_str_list('LANGUAGES')
else:
self.languages = [
'af', 'ar', 'br', 'ca', 'cs', 'de', 'en', 'es',
'et', 'eu', 'fa', 'fi', 'fr', 'gl', 'hr', 'hu',
'ia', 'is', 'it', 'ja', 'mk', 'nl', 'no', 'pl',
'ps', 'pt', 'ru', 'sk', 'sl', 'sv', 'uk', 'vi']
return self.config.LANGUAGES.split(',') if self.config.LANGUAGES else default_languages
@staticmethod
def _get_wiki_content(lang):
"""
Request and return the wiki page's content
corresponding to special phrases for a given lang.
Requested URL Example :
https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/EN
"""
url = 'https://wiki.openstreetmap.org/wiki/Special:Export/Nominatim/Special_Phrases/' \
+ lang.upper()
return get_url(url)

View File

@@ -10,9 +10,7 @@
This class is a model used to transfer a special phrase through
the process of load and importation.
"""
import re
class SpecialPhrase():
class SpecialPhrase:
"""
Model representing a special phrase.
"""
@@ -20,7 +18,19 @@ class SpecialPhrase():
self.p_label = p_label.strip()
self.p_class = p_class.strip()
# Hack around a bug where building=yes was imported with quotes into the wiki
self.p_type = re.sub(r'\"|"', '', p_type.strip())
self.p_type = p_type.strip()
# Needed if some operator in the wiki are not written in english
p_operator = p_operator.strip().lower()
self.p_operator = '-' if p_operator not in ('near', 'in') else p_operator
def __eq__(self, other):
    """ Two special phrases are equal when label, class, type and
        operator all match. Anything that is not a SpecialPhrase
        compares unequal.
    """
    if isinstance(other, SpecialPhrase):
        return (self.p_label, self.p_class, self.p_type, self.p_operator) \
               == (other.p_label, other.p_class, other.p_type, other.p_operator)
    return False
def __hash__(self):
    """ Return a hash consistent with __eq__, derived from the same
        (label, class, type, operator) tuple used for comparison.
    """
    return hash((self.p_label, self.p_class, self.p_type, self.p_operator))