housenumber analyzer: avoid creating too many variants

Housenumber fields with lots of text are likely bad data. So are
fields with many changes between letters and digits. Exclude both
from having optional spaces added.
This commit is contained in:
Sarah Hoffmann
2022-02-16 20:36:30 +01:00
parent f03a05f6bb
commit 13ed184efd

View File

@@ -15,17 +15,18 @@ from nominatim.tokenizer.token_analysis.generic_mutation import MutationVariantG
RE_NON_DIGIT = re.compile('[^0-9]') RE_NON_DIGIT = re.compile('[^0-9]')
RE_DIGIT_ALPHA = re.compile(r'(\d)\s*([^\d\s␣])') RE_DIGIT_ALPHA = re.compile(r'(\d)\s*([^\d\s␣])')
RE_ALPHA_DIGIT = re.compile(r'([^\s\d␣])\s*(\d)') RE_ALPHA_DIGIT = re.compile(r'([^\s\d␣])\s*(\d)')
RE_NAMED_PART = re.compile(r'[a-z]{4}')
### Configuration section ### Configuration section
def configure(rules, normalization_rules): def configure(rules, normalization_rules): # pylint: disable=W0613
""" All behaviour is currently hard-coded. """ All behaviour is currently hard-coded.
""" """
return None return None
### Analysis section ### Analysis section
def create(normalizer, transliterator, config): def create(normalizer, transliterator, config): # pylint: disable=W0613
""" Create a new token analysis instance for this module. """ Create a new token analysis instance for this module.
""" """
return HousenumberTokenAnalysis(normalizer, transliterator) return HousenumberTokenAnalysis(normalizer, transliterator)
@@ -48,8 +49,14 @@ class HousenumberTokenAnalysis:
return name return name
norm = self.trans.transliterate(self.norm.transliterate(name)) norm = self.trans.transliterate(self.norm.transliterate(name))
norm = RE_DIGIT_ALPHA.sub(r'\1␣\2', norm) # If there is a significant non-numeric part, use as is.
norm = RE_ALPHA_DIGIT.sub(r'\1␣\2', norm) if RE_NAMED_PART.search(norm) is None:
# Otherwise add optional spaces between digits and letters.
(norm_opt, cnt1) = RE_DIGIT_ALPHA.subn(r'\1␣\2', norm)
(norm_opt, cnt2) = RE_ALPHA_DIGIT.subn(r'\1␣\2', norm_opt)
# Avoid creating too many variants per number.
if cnt1 + cnt2 <= 4:
return norm_opt
return norm return norm