add support for country prefixes in postcodes

This commit is contained in:
Sarah Hoffmann
2025-03-04 15:18:27 +01:00
parent 921db8bb2f
commit 434fbbfd18

View File

@@ -7,7 +7,7 @@
""" """
Handling of arbitrary postcode tokens in tokenized query string. Handling of arbitrary postcode tokens in tokenized query string.
""" """
from typing import Tuple, Set from typing import Tuple, Set, Dict, List
import re import re
from collections import defaultdict from collections import defaultdict
@@ -31,20 +31,21 @@ class PostcodeParser:
cdata = yaml.safe_load(config.find_config_file('country_settings.yaml') cdata = yaml.safe_load(config.find_config_file('country_settings.yaml')
.read_text(encoding='utf-8')) .read_text(encoding='utf-8'))
unique_patterns = defaultdict(set) unique_patterns: Dict[str, Dict[str, List[str]]] = {}
for cc, data in cdata.items(): for cc, data in cdata.items():
if data.get('postcode'): if data.get('postcode'):
pat = data['postcode']['pattern'] pat = data['postcode']['pattern'].replace('d', '[0-9]').replace('l', '[a-z]')
out = data['postcode'].get('output') out = data['postcode'].get('output')
unique_patterns[pat.replace('d', '[0-9]').replace('l', '[a-z]')].add(out) if pat not in unique_patterns:
unique_patterns[pat] = defaultdict(list)
unique_patterns[pat][out].append(cc)
self.global_pattern = re.compile( self.global_pattern = re.compile(
'(?:' + '(?:(?P<cc>[a-z][a-z])(?P<space>[ -]?))?(?P<pc>(?:(?:'
'|'.join(f"(?:{k})" for k in unique_patterns) + ')|(?:'.join(unique_patterns) + '))[:, >].*)')
+ ')[:, >]')
self.local_patterns = [(re.compile(f"(?:{k})[:, >]"), v) self.local_patterns = [(re.compile(f"{pat}[:, >]"), list(info.items()))
for k, v in unique_patterns.items()] for pat, info in unique_patterns.items()]
def parse(self, query: qmod.QueryStruct) -> Set[Tuple[int, int, str]]: def parse(self, query: qmod.QueryStruct) -> Set[Tuple[int, int, str]]:
""" Parse postcodes in the given list of query tokens taking into """ Parse postcodes in the given list of query tokens taking into
@@ -64,18 +65,22 @@ class PostcodeParser:
if word[-1] in ' -' and nodes[i + 3].btype != '`': if word[-1] in ' -' and nodes[i + 3].btype != '`':
word += nodes[i + 3].term_normalized + nodes[i + 3].btype word += nodes[i + 3].term_normalized + nodes[i + 3].btype
# Use global pattern to check for presence of any postocde. # Use global pattern to check for presence of any postcode.
m = self.global_pattern.match(word) m = self.global_pattern.fullmatch(word)
if m: if m:
# If there was a match, check against each pattern separately # If there was a match, check against each pattern separately
# because multiple patterns might be matching at the end. # because multiple patterns might be matching at the end.
cc = m.group('cc')
pc_word = m.group('pc')
cc_spaces = len(m.group('space') or '')
for pattern, info in self.local_patterns: for pattern, info in self.local_patterns:
lm = pattern.match(word) lm = pattern.match(pc_word)
if lm: if lm:
trange = (i, i + sum(c in ' ,-:>' for c in lm.group(0))) trange = (i, i + cc_spaces + sum(c in ' ,-:>' for c in lm.group(0)))
for out in info: for out, out_ccs in info:
if out: if cc is None or cc in out_ccs:
outcodes.add((*trange, lm.expand(out).upper())) if out:
else: outcodes.add((*trange, lm.expand(out).upper()))
outcodes.add((*trange, lm.group(0)[:-1].upper())) else:
outcodes.add((*trange, lm.group(0)[:-1].upper()))
return outcodes return outcodes