move warm script to python code

2026-02-26 02:58:13 +00:00 · 2023-07-16 20:12:53 +02:00
parent 261e0cfd5a
commit faeee7528f
6 changed files with 53 additions and 133 deletions
--- a/nominatim/clicmd/admin.py
+++ b/nominatim/clicmd/admin.py
@@ -9,9 +9,11 @@ Implementation of the 'admin' subcommand.
 """
 import logging
 import argparse
+import random

-from nominatim.tools.exec_utils import run_legacy_script
+from nominatim.db.connection import connect
 from nominatim.clicmd.args import NominatimArgs
+import nominatim.api as napi

 # Do not repeat documentation of subcommand classes.
 # pylint: disable=C0111
@@ -81,11 +83,25 @@ class AdminFuncs:

        return 1

+
    def _warm(self, args: NominatimArgs) -> int:
        LOG.warning('Warming database caches')
-        params = ['warm.php']
-        if args.target == 'reverse':
-            params.append('--reverse-only')
-        if args.target == 'search':
-            params.append('--search-only')
-        return run_legacy_script(*params, config=args.config)
+
+        api = napi.NominatimAPI(args.project_dir)
+        # As with the legacy flags: 'reverse' warms reverse only, 'search' search only.
+        if args.target != 'search':
+            for _ in range(1000):
+                api.reverse((random.uniform(-90, 90), random.uniform(-180, 180)),
+                            address_details=True)
+        # Search warming replays the most frequent words reported by the tokenizer.
+        if args.target != 'reverse':
+            from ..tokenizer import factory as tokenizer_factory
+
+            tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
+            with connect(args.config.get_libpq_dsn()) as conn:
+                words = tokenizer.most_frequent_words(conn, 1000)
+
+            for word in words:
+                api.search(word)
+
+        return 0
--- a/nominatim/tokenizer/base.py
+++ b/nominatim/tokenizer/base.py
@@ -13,6 +13,7 @@ from typing import List, Tuple, Dict, Any, Optional, Iterable
 from pathlib import Path

 from nominatim.config import Configuration
+from nominatim.db.connection import Connection
 from nominatim.data.place_info import PlaceInfo
 from nominatim.typing import Protocol

@@ -233,6 +234,13 @@ class AbstractTokenizer(ABC):
        """


+    @abstractmethod
+    def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
+        """ Return a list of the `num` most frequent full words
+            in the database, queried through the connection `conn`.
+        """
+
+
 class TokenizerModule(Protocol):
    """ Interface that must be exported by modules that implement their
        own tokenizer.
--- a/nominatim/tokenizer/icu_tokenizer.py
+++ b/nominatim/tokenizer/icu_tokenizer.py
@@ -183,6 +183,18 @@ class ICUTokenizer(AbstractTokenizer):
                               self.loader.make_token_analysis())


+    def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
+        """ Look up the `num` full words with the highest summed count
+            in the word table and return them cut off at the first '@'.
+        """
+        with conn.cursor() as cursor:
+            cursor.execute("""SELECT word, sum((info->'count')::int) as count
+                             FROM word WHERE type = 'W'
+                             GROUP BY word
+                             ORDER BY count DESC LIMIT %s""", (num,))
+            return [row[0].split('@')[0] for row in cursor]
+
+
    def _install_php(self, phpdir: Path, overwrite: bool = True) -> None:
        """ Install the php script for the tokenizer.
        """
--- a/nominatim/tokenizer/legacy_tokenizer.py
+++ b/nominatim/tokenizer/legacy_tokenizer.py
@@ -256,6 +256,16 @@ class LegacyTokenizer(AbstractTokenizer):
        return LegacyNameAnalyzer(self.dsn, normalizer)


+    def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
+        """ Look up the `num` entries of the word table with the highest
+            search_name_count, leaving out rows that have no word.
+        """
+        with conn.cursor() as cursor:
+            cursor.execute(""" SELECT word FROM word WHERE word is not null
+                              ORDER BY search_name_count DESC LIMIT %s""", (num,))
+            return [row[0] for row in cursor]
+
+
    def _install_php(self, config: Configuration, overwrite: bool = True) -> None:
        """ Install the php script for the tokenizer.
        """