mirror of
https://github.com/osm-search/Nominatim.git
synced 2026-02-15 02:47:59 +00:00
housenumber analyzer: avoid creating too many variants
Housenumber fields with lots of text are likely bad data, and so are values with many changes between letters and digits. Do not add optional spaces to either kind.
This commit is contained in:
@@ -15,17 +15,18 @@ from nominatim.tokenizer.token_analysis.generic_mutation import MutationVariantG
|
||||
RE_NON_DIGIT = re.compile('[^0-9]')
|
||||
RE_DIGIT_ALPHA = re.compile(r'(\d)\s*([^\d\s␣])')
|
||||
RE_ALPHA_DIGIT = re.compile(r'([^\s\d␣])\s*(\d)')
|
||||
RE_NAMED_PART = re.compile(r'[a-z]{4}')
|
||||
|
||||
### Configuration section
|
||||
|
||||
def configure(rules, normalization_rules):
|
||||
def configure(rules, normalization_rules): # pylint: disable=W0613
|
||||
""" All behaviour is currently hard-coded.
|
||||
"""
|
||||
return None
|
||||
|
||||
### Analysis section
|
||||
|
||||
def create(normalizer, transliterator, config):
|
||||
def create(normalizer, transliterator, config): # pylint: disable=W0613
|
||||
""" Create a new token analysis instance for this module.
|
||||
"""
|
||||
return HousenumberTokenAnalysis(normalizer, transliterator)
|
||||
@@ -48,8 +49,14 @@ class HousenumberTokenAnalysis:
|
||||
return name
|
||||
|
||||
norm = self.trans.transliterate(self.norm.transliterate(name))
|
||||
norm = RE_DIGIT_ALPHA.sub(r'\1␣\2', norm)
|
||||
norm = RE_ALPHA_DIGIT.sub(r'\1␣\2', norm)
|
||||
# If there is a significant non-numeric part, use as is.
|
||||
if RE_NAMED_PART.search(norm) is None:
|
||||
# Otherwise add optional spaces between digits and letters.
|
||||
(norm_opt, cnt1) = RE_DIGIT_ALPHA.subn(r'\1␣\2', norm)
|
||||
(norm_opt, cnt2) = RE_ALPHA_DIGIT.subn(r'\1␣\2', norm_opt)
|
||||
# Avoid creating too many variants per number.
|
||||
if cnt1 + cnt2 <= 4:
|
||||
return norm_opt
|
||||
|
||||
return norm
|
||||
|
||||
|
||||
Reference in New Issue
Block a user