mirror of
https://github.com/osm-search/Nominatim.git
synced 2026-03-11 13:24:07 +00:00
move filling of postcode table to python
The Python code now takes care of reading postcodes from placex, enhancing them with any existing external postcodes, and updating the location_postcode table accordingly. The initial setup and subsequent updates use exactly the same function. External postcode handling has been generalized: external postcodes for any country are now accepted. The format of the external postcode file has changed: we now expect CSV, optionally gzipped. The postcodes are no longer saved in the database.
This commit is contained in:
@@ -263,6 +263,16 @@ class LegacyICUNameAnalyzer:
|
||||
"""
|
||||
return self.normalizer.transliterate(phrase)
|
||||
|
||||
@staticmethod
|
||||
def normalize_postcode(postcode):
|
||||
""" Convert the postcode to a standardized form.
|
||||
|
||||
This function must yield exactly the same result as the SQL function
|
||||
'token_normalized_postcode()'.
|
||||
"""
|
||||
return postcode.strip().upper()
|
||||
|
||||
|
||||
@functools.lru_cache(maxsize=1024)
|
||||
def make_standard_word(self, name):
|
||||
""" Create the normalised version of the input.
|
||||
@@ -285,25 +295,44 @@ class LegacyICUNameAnalyzer:
|
||||
|
||||
return self.transliterator.transliterate(hnr)
|
||||
|
||||
def add_postcodes_from_db(self):
|
||||
""" Add postcodes from the location_postcode table to the word table.
|
||||
def update_postcodes_from_db(self):
|
||||
""" Update postcode tokens in the word table from the location_postcode
|
||||
table.
|
||||
"""
|
||||
to_delete = []
|
||||
copystr = io.StringIO()
|
||||
with self.conn.cursor() as cur:
|
||||
cur.execute("SELECT distinct(postcode) FROM location_postcode")
|
||||
for (postcode, ) in cur:
|
||||
copystr.write(postcode)
|
||||
copystr.write('\t ')
|
||||
copystr.write(self.transliterator.transliterate(postcode))
|
||||
copystr.write('\tplace\tpostcode\t0\n')
|
||||
# This finds us the rows in location_postcode and word that are
|
||||
# missing in the other table.
|
||||
cur.execute("""SELECT * FROM
|
||||
(SELECT pc, word FROM
|
||||
(SELECT distinct(postcode) as pc FROM location_postcode) p
|
||||
FULL JOIN
|
||||
(SELECT word FROM word
|
||||
WHERE class ='place' and type = 'postcode') w
|
||||
ON pc = word) x
|
||||
WHERE pc is null or word is null""")
|
||||
|
||||
copystr.seek(0)
|
||||
cur.copy_from(copystr, 'word',
|
||||
columns=['word', 'word_token', 'class', 'type',
|
||||
'search_name_count'])
|
||||
# Don't really need an ID for postcodes....
|
||||
# cur.execute("""UPDATE word SET word_id = nextval('seq_word')
|
||||
# WHERE word_id is null and type = 'postcode'""")
|
||||
for postcode, word in cur:
|
||||
if postcode is None:
|
||||
to_delete.append(word)
|
||||
else:
|
||||
copystr.write(postcode)
|
||||
copystr.write('\t ')
|
||||
copystr.write(self.transliterator.transliterate(postcode))
|
||||
copystr.write('\tplace\tpostcode\t0\n')
|
||||
|
||||
if to_delete:
|
||||
cur.execute("""DELETE FROM WORD
|
||||
WHERE class ='place' and type = 'postcode'
|
||||
and word = any(%s)
|
||||
""", (to_delete, ))
|
||||
|
||||
if copystr.getvalue():
|
||||
copystr.seek(0)
|
||||
cur.copy_from(copystr, 'word',
|
||||
columns=['word', 'word_token', 'class', 'type',
|
||||
'search_name_count'])
|
||||
|
||||
|
||||
def update_special_phrases(self, phrases):
|
||||
@@ -435,22 +464,25 @@ class LegacyICUNameAnalyzer:
|
||||
def _add_postcode(self, postcode):
|
||||
""" Make sure the normalized postcode is present in the word table.
|
||||
"""
|
||||
if re.search(r'[:,;]', postcode) is None and not postcode in self._cache.postcodes:
|
||||
term = self.make_standard_word(postcode)
|
||||
if not term:
|
||||
return
|
||||
if re.search(r'[:,;]', postcode) is None:
|
||||
postcode = self.normalize_postcode(postcode)
|
||||
|
||||
with self.conn.cursor() as cur:
|
||||
# no word_id needed for postcodes
|
||||
cur.execute("""INSERT INTO word (word, word_token, class, type,
|
||||
search_name_count)
|
||||
(SELECT pc, %s, 'place', 'postcode', 0
|
||||
FROM (VALUES (%s)) as v(pc)
|
||||
WHERE NOT EXISTS
|
||||
(SELECT * FROM word
|
||||
WHERE word = pc and class='place' and type='postcode'))
|
||||
""", (' ' + term, postcode))
|
||||
self._cache.postcodes.add(postcode)
|
||||
if postcode not in self._cache.postcodes:
|
||||
term = self.make_standard_word(postcode)
|
||||
if not term:
|
||||
return
|
||||
|
||||
with self.conn.cursor() as cur:
|
||||
# no word_id needed for postcodes
|
||||
cur.execute("""INSERT INTO word (word, word_token, class, type,
|
||||
search_name_count)
|
||||
(SELECT pc, %s, 'place', 'postcode', 0
|
||||
FROM (VALUES (%s)) as v(pc)
|
||||
WHERE NOT EXISTS
|
||||
(SELECT * FROM word
|
||||
WHERE word = pc and class='place' and type='postcode'))
|
||||
""", (' ' + term, postcode))
|
||||
self._cache.postcodes.add(postcode)
|
||||
|
||||
@staticmethod
|
||||
def _split_housenumbers(hnrs):
|
||||
|
||||
@@ -305,13 +305,51 @@ class LegacyNameAnalyzer:
|
||||
return self.normalizer.transliterate(phrase)
|
||||
|
||||
|
||||
def add_postcodes_from_db(self):
|
||||
""" Add postcodes from the location_postcode table to the word table.
|
||||
@staticmethod
|
||||
def normalize_postcode(postcode):
|
||||
""" Convert the postcode to a standardized form.
|
||||
|
||||
This function must yield exactly the same result as the SQL function
|
||||
'token_normalized_postcode()'.
|
||||
"""
|
||||
return postcode.strip().upper()
|
||||
|
||||
|
||||
def update_postcodes_from_db(self):
|
||||
""" Update postcode tokens in the word table from the location_postcode
|
||||
table.
|
||||
"""
|
||||
with self.conn.cursor() as cur:
|
||||
cur.execute("""SELECT count(create_postcode_id(pc))
|
||||
FROM (SELECT distinct(postcode) as pc
|
||||
FROM location_postcode) x""")
|
||||
# This finds us the rows in location_postcode and word that are
|
||||
# missing in the other table.
|
||||
cur.execute("""SELECT * FROM
|
||||
(SELECT pc, word FROM
|
||||
(SELECT distinct(postcode) as pc FROM location_postcode) p
|
||||
FULL JOIN
|
||||
(SELECT word FROM word
|
||||
WHERE class ='place' and type = 'postcode') w
|
||||
ON pc = word) x
|
||||
WHERE pc is null or word is null""")
|
||||
|
||||
to_delete = []
|
||||
to_add = []
|
||||
|
||||
for postcode, word in cur:
|
||||
if postcode is None:
|
||||
to_delete.append(word)
|
||||
else:
|
||||
to_add.append(postcode)
|
||||
|
||||
if to_delete:
|
||||
cur.execute("""DELETE FROM WORD
|
||||
WHERE class ='place' and type = 'postcode'
|
||||
and word = any(%s)
|
||||
""", (to_delete, ))
|
||||
if to_add:
|
||||
cur.execute("""SELECT count(create_postcode_id(pc))
|
||||
FROM unnest(%s) as pc
|
||||
""", (to_add, ))
|
||||
|
||||
|
||||
|
||||
def update_special_phrases(self, phrases):
|
||||
@@ -421,7 +459,8 @@ class LegacyNameAnalyzer:
|
||||
cur.execute('SELECT create_postcode_id(%s)', (pcode, ))
|
||||
|
||||
if re.search(r'[:,;]', postcode) is None:
|
||||
self._cache.postcodes.get(postcode.strip().upper(), _create_postcode_from_db)
|
||||
self._cache.postcodes.get(self.normalize_postcode(postcode),
|
||||
_create_postcode_from_db)
|
||||
|
||||
|
||||
class _TokenInfo:
|
||||
|
||||
Reference in New Issue
Block a user