move warm script to python code

This commit is contained in:
Sarah Hoffmann
2023-07-16 20:12:53 +02:00
parent 261e0cfd5a
commit faeee7528f
6 changed files with 53 additions and 133 deletions

View File

@@ -9,9 +9,11 @@ Implementation of the 'admin' subcommand.
"""
import logging
import argparse
import random
from nominatim.tools.exec_utils import run_legacy_script
from nominatim.db.connection import connect
from nominatim.clicmd.args import NominatimArgs
import nominatim.api as napi
# Do not repeat documentation of subcommand classes.
# pylint: disable=C0111
@@ -81,11 +83,25 @@ class AdminFuncs:
return 1
def _warm(self, args: NominatimArgs) -> int:
    """ Warm up the database caches by sending sample queries through
        the Python API.

        `args.target` restricts what is warmed: 'reverse' runs only the
        reverse lookups, 'search' runs only the searches. Any other value
        runs both. Always returns 0.
    """
    LOG.warning('Warming database caches')

    api = napi.NominatimAPI(args.project_dir)

    try:
        # Reverse lookups are skipped only for a search-only run.
        if args.target != 'search':
            for _ in range(1000):
                # NOTE(review): the tuple is handed to the API unchanged;
                # confirm the coordinate order that reverse() expects.
                api.reverse((random.uniform(-90, 90), random.uniform(-180, 180)),
                            address_details=True)

        # Searches are skipped only for a reverse-only run.
        if args.target != 'reverse':
            # Local import: the tokenizer is only needed on this path.
            from ..tokenizer import factory as tokenizer_factory

            tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
            with connect(args.config.get_libpq_dsn()) as conn:
                words = tokenizer.most_frequent_words(conn, 1000)

            for word in words:
                api.search(word)
    finally:
        # Release the API's database connections even when a query fails.
        api.close()

    return 0

View File

@@ -13,6 +13,7 @@ from typing import List, Tuple, Dict, Any, Optional, Iterable
from pathlib import Path
from nominatim.config import Configuration
from nominatim.db.connection import Connection
from nominatim.data.place_info import PlaceInfo
from nominatim.typing import Protocol
@@ -233,6 +234,13 @@ class AbstractTokenizer(ABC):
"""
@abstractmethod
def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
    """ Return a list of the `num` most frequent full words
        in the database.

        `conn` is the database connection the implementation should
        use for the lookup. Must be implemented by each tokenizer.
    """
class TokenizerModule(Protocol):
""" Interface that must be exported by modules that implement their
own tokenizer.

View File

@@ -183,6 +183,18 @@ class ICUTokenizer(AbstractTokenizer):
self.loader.make_token_analysis())
def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
    """ Look up the `num` full words (type 'W') with the highest summed
        count in the word table, most frequent first.

        Everything from the first '@' onwards is cut off each
        returned word.
    """
    with conn.cursor() as cur:
        cur.execute("""SELECT word, sum((info->'count')::int) as count
FROM word WHERE type = 'W'
GROUP BY word
ORDER BY count DESC LIMIT %s""", (num,))
        # Rows must be consumed while the cursor is still open.
        frequent = [row[0].partition('@')[0] for row in cur]

    return frequent
def _install_php(self, phpdir: Path, overwrite: bool = True) -> None:
""" Install the php script for the tokenizer.
"""

View File

@@ -256,6 +256,16 @@ class LegacyTokenizer(AbstractTokenizer):
return LegacyNameAnalyzer(self.dsn, normalizer)
def most_frequent_words(self, conn: Connection, num: int) -> List[str]:
    """ Look up the `num` non-null words with the highest
        `search_name_count` in the word table, most frequent first.
    """
    with conn.cursor() as cur:
        cur.execute(""" SELECT word FROM word WHERE word is not null
ORDER BY search_name_count DESC LIMIT %s""", (num,))
        # Rows must be consumed while the cursor is still open.
        frequent = [row[0] for row in cur]

    return frequent
def _install_php(self, config: Configuration, overwrite: bool = True) -> None:
""" Install the php script for the tokenizer.
"""