cache postcode normalization

This commit is contained in:
Sarah Hoffmann
2022-06-07 12:08:22 +02:00
parent b5e5efc131
commit 2eca9fc8af
2 changed files with 28 additions and 7 deletions

View File

@@ -78,6 +78,18 @@ class PostcodeFormatter:
self.default_matcher = CountryPostcodeMatcher('', {'pattern': pattern}) self.default_matcher = CountryPostcodeMatcher('', {'pattern': pattern})
def get_matcher(self, country_code):
    """ Return the CountryPostcodeMatcher for the given country.

        Returns None if the country doesn't have a postcode and the
        default matcher if there is no specific matcher configured for
        the country.
    """
    # The no-postcode check comes first, so it takes precedence over
    # any matcher that might also be registered for the country.
    if country_code in self.country_without_postcode:
        return None

    # Fall back to the default matcher for countries without their own.
    return self.country_matcher.get(country_code, self.default_matcher)
def match(self, country_code, postcode): def match(self, country_code, postcode):
""" Match the given postcode against the postcode pattern for this """ Match the given postcode against the postcode pattern for this
matcher. Returns a `re.Match` object if the country has a pattern matcher. Returns a `re.Match` object if the country has a pattern

View File

@@ -37,16 +37,27 @@ class _CountryPostcodesCollector:
""" Collector for postcodes of a single country. """ Collector for postcodes of a single country.
""" """
def __init__(self, country): def __init__(self, country, matcher):
self.country = country self.country = country
self.matcher = matcher
self.collected = defaultdict(PointsCentroid) self.collected = defaultdict(PointsCentroid)
self.normalization_cache = None
def add(self, postcode, x, y): def add(self, postcode, x, y):
""" Add the given postcode to the collection cache. If the postcode """ Add the given postcode to the collection cache. If the postcode
already existed, it is overwritten with the new centroid. already existed, it is overwritten with the new centroid.
""" """
self.collected[postcode] += (x, y) if self.matcher is not None:
if self.normalization_cache and self.normalization_cache[0] == postcode:
normalized = self.normalization_cache[1]
else:
match = self.matcher.match(postcode)
normalized = self.matcher.normalize(match) if match else None
self.normalization_cache = (postcode, normalized)
if normalized:
self.collected[normalized] += (x, y)
def commit(self, conn, analyzer, project_dir): def commit(self, conn, analyzer, project_dir):
@@ -193,18 +204,16 @@ def update_postcodes(dsn, project_dir, tokenizer):
if collector is None or country != collector.country: if collector is None or country != collector.country:
if collector is not None: if collector is not None:
collector.commit(conn, analyzer, project_dir) collector.commit(conn, analyzer, project_dir)
collector = _CountryPostcodesCollector(country) collector = _CountryPostcodesCollector(country, matcher.get_matcher(country))
todo_countries.discard(country) todo_countries.discard(country)
match = matcher.match(country, postcode) collector.add(postcode, x, y)
if match:
collector.add(matcher.normalize(country, match), x, y)
if collector is not None: if collector is not None:
collector.commit(conn, analyzer, project_dir) collector.commit(conn, analyzer, project_dir)
# Now handle any countries that are only in the postcode table. # Now handle any countries that are only in the postcode table.
for country in todo_countries: for country in todo_countries:
_CountryPostcodesCollector(country).commit(conn, analyzer, project_dir) _CountryPostcodesCollector(country, matcher.get_matcher(country)).commit(conn, analyzer, project_dir)
conn.commit() conn.commit()