add Python part for new ICU-based tokenizer

2026-02-26 11:08:13 +00:00 · 2021-05-02 17:52:45 +02:00
parent 3c67bae868
commit f44af49df9
6 changed files with 1626 additions and 1 deletions
--- a/.pylintrc
+++ b/.pylintrc
@@ -10,4 +10,4 @@ ignored-modules=icu
 # closing added here because it sometimes triggers a false positive with
 # 'with' statements.
 ignored-classes=NominatimArgs,closing
-disable=too-few-public-methods
+disable=too-few-public-methods,duplicate-code
--- a/lib-sql/tokenizer/legacy_icu_tokenizer.sql
+++ b/lib-sql/tokenizer/legacy_icu_tokenizer.sql
@@ -0,0 +1,134 @@
 -- Tokens under which the given place can be found in a search.
 --
 -- Reads the 'names' entry of the token info and casts it to an integer
 -- array. These are the tokens that will be saved in the search_name table.
 CREATE OR REPLACE FUNCTION token_get_name_search_tokens(info JSONB)
  RETURNS INTEGER[]
  LANGUAGE SQL IMMUTABLE STRICT
 AS $$
  SELECT CAST(info ->> 'names' AS INTEGER[]);
 $$;
 -- Tokens used when the name of the place is matched against other names.
 --
 -- This should usually be restricted to full name tokens. The function
 -- currently hands back the complete 'names' entry of the token info.
 CREATE OR REPLACE FUNCTION token_get_name_match_tokens(info JSONB)
  RETURNS INTEGER[]
  LANGUAGE SQL IMMUTABLE STRICT
 AS $$
  SELECT CAST(info ->> 'names' AS INTEGER[]);
 $$;
 -- Housenumber tokens of the place, read from the 'hnr_tokens' entry
 -- of the token info.
 CREATE OR REPLACE FUNCTION token_get_housenumber_search_tokens(info JSONB)
  RETURNS INTEGER[]
  LANGUAGE SQL IMMUTABLE STRICT
 AS $$
  SELECT CAST(info ->> 'hnr_tokens' AS INTEGER[]);
 $$;
 -- Housenumber in the form in which it can be matched during search,
 -- i.e. the text stored under 'hnr' in the token info.
 CREATE OR REPLACE FUNCTION token_normalized_housenumber(info JSONB)
  RETURNS TEXT
  LANGUAGE SQL IMMUTABLE STRICT
 AS $$
  SELECT info ->> 'hnr';
 $$;
 -- Tokens for matching the addr:street part, read from the 'street'
 -- entry of the token info.
 CREATE OR REPLACE FUNCTION token_addr_street_match_tokens(info JSONB)
  RETURNS INTEGER[]
  LANGUAGE SQL IMMUTABLE STRICT
 AS $$
  SELECT CAST(info ->> 'street' AS INTEGER[]);
 $$;
 -- Tokens for matching the addr:place part, read from the 'place_match'
 -- entry of the token info.
 CREATE OR REPLACE FUNCTION token_addr_place_match_tokens(info JSONB)
  RETURNS INTEGER[]
  LANGUAGE SQL IMMUTABLE STRICT
 AS $$
  SELECT CAST(info ->> 'place_match' AS INTEGER[]);
 $$;
 -- Tokens for searching the addr:place part, read from the 'place_search'
 -- entry of the token info.
 CREATE OR REPLACE FUNCTION token_addr_place_search_tokens(info JSONB)
  RETURNS INTEGER[]
  LANGUAGE SQL IMMUTABLE STRICT
 AS $$
  SELECT CAST(info ->> 'place_search' AS INTEGER[]);
 $$;
 -- Row type for one address part: the address key together with the
 -- tokens for matching and for searching it.
 DROP TYPE IF EXISTS token_addresstoken CASCADE;
 CREATE TYPE token_addresstoken AS (
  key TEXT,
  match_tokens INT[],
  search_tokens INT[]
 );
 -- Unpack the 'addr' object of the token info into one row per key.
 -- Each value is a two-element array: element 0 holds the search tokens,
 -- element 1 the match tokens.
 CREATE OR REPLACE FUNCTION token_get_address_tokens(info JSONB)
  RETURNS SETOF token_addresstoken
  LANGUAGE SQL IMMUTABLE STRICT
 AS $$
  SELECT addr.key,
         CAST(addr.value ->> 1 AS INT[]) AS match_tokens,
         CAST(addr.value ->> 0 AS INT[]) AS search_tokens
    FROM jsonb_each(info -> 'addr') AS addr;
 $$;
 -- Normalise a postcode: surrounding whitespace is trimmed and the result
 -- is upper-cased. Values containing a comma or semicolon yield NULL.
 CREATE OR REPLACE FUNCTION token_normalized_postcode(postcode TEXT)
  RETURNS TEXT
  LANGUAGE SQL IMMUTABLE STRICT
 AS $$
  SELECT CASE
           WHEN postcode SIMILAR TO '%(,|;)%' THEN NULL
           ELSE upper(trim(postcode))
         END;
 $$;
 -- Token info that should be saved permanently in the database.
 -- This tokenizer keeps nothing: the result is always NULL.
 CREATE OR REPLACE FUNCTION token_strip_info(info JSONB)
  RETURNS JSONB
  LANGUAGE SQL IMMUTABLE STRICT
 AS $$
  SELECT CAST(NULL AS JSONB);
 $$;
 --------------- private functions ----------------------------------------------
 -- Return the word id for a term token, adding the token to the word
 -- table when it does not exist yet. Only entries without class and type
 -- are considered. Tokens with a leading blank whose search_name_count
 -- exceeds the configured maximum word frequency yield the id 0.
 CREATE OR REPLACE FUNCTION getorcreate_term_id(lookup_term TEXT)
  RETURNS INTEGER
  AS $$
 DECLARE
  term_id INTEGER;
  usage_count INTEGER;
 BEGIN
  SELECT min(word_id), max(search_name_count)
    INTO term_id, usage_count
    FROM word
    WHERE word_token = lookup_term AND class IS NULL AND type IS NULL;
  IF term_id IS NOT NULL THEN
    -- known token: suppress it when it is a too frequent full-name token
    IF left(lookup_term, 1) = ' ' AND usage_count > {{ max_word_freq }} THEN
      RETURN 0;
    END IF;
    RETURN term_id;
  END IF;
  -- unknown token: register it with a fresh id and a zero count
  term_id := nextval('seq_word');
  INSERT INTO word (word_id, word_token, search_name_count)
    VALUES (term_id, lookup_term, 0);
  RETURN term_id;
 END;
 $$
 LANGUAGE plpgsql;
 -- Return the word id for a housenumber, adding an entry to the word
 -- table when the housenumber is not yet known. Housenumbers are saved
 -- with class 'place' and type 'house'; the token is the housenumber
 -- prefixed with a single blank.
 CREATE OR REPLACE FUNCTION getorcreate_hnr_id(lookup_term TEXT)
  RETURNS INTEGER
  AS $$
 DECLARE
  return_id INTEGER;
  -- Build the token once so that lookup and insert always agree. The
  -- lookup previously used a two-blank prefix, never found the rows
  -- inserted below and therefore created a duplicate on every call.
  hnr_token TEXT := ' ' || lookup_term;
 BEGIN
  SELECT min(word_id) INTO return_id
    FROM word
    WHERE word_token = hnr_token
          and class = 'place' and type = 'house';
  IF return_id IS NULL THEN
    return_id := nextval('seq_word');
    INSERT INTO word (word_id, word_token, class, type, search_name_count)
      VALUES (return_id, hnr_token, 'place', 'house', 0);
  END IF;
  RETURN return_id;
 END;
 $$
 LANGUAGE plpgsql;
--- a/lib-sql/tokenizer/legacy_tokenizer_tables.sql
+++ b/lib-sql/tokenizer/legacy_tokenizer_tables.sql
@@ -12,6 +12,8 @@ CREATE TABLE word (
 -- Index for finding word entries by their token.
 CREATE INDEX idx_word_word_token ON word
    USING BTREE (word_token) {{db.tablespace.search_index}};
 -- Partial index on the word column, restricted to rows where it is set.
 CREATE INDEX idx_word_word ON word
    USING BTREE (word) {{db.tablespace.search_index}} WHERE word is not null;
 -- Give read access to the web user configured in DATABASE_WEBUSER.
 GRANT SELECT ON word TO "{{config.DATABASE_WEBUSER}}";
 -- Remove a word id sequence that may be left over.
 DROP SEQUENCE IF EXISTS seq_word;
--- a/nominatim/tokenizer/legacy_icu_tokenizer.py
+++ b/nominatim/tokenizer/legacy_icu_tokenizer.py
@@ -0,0 +1,594 @@
 """
 Tokenizer implementing normalisation as used before Nominatim 4 but using
 libICU instead of the PostgreSQL module.
 """
 from collections import Counter
 import io
 import itertools
 import json
 import logging
 import re
 from textwrap import dedent
 from pathlib import Path
 from icu import Transliterator
 import psycopg2.extras
 from nominatim.db.connection import connect
 from nominatim.db.properties import set_property, get_property
 from nominatim.db.sql_preprocessor import SQLPreprocessor
 # Names of the database properties under which the tokenizer settings
 # are saved with set_property() and read back with get_property().
 DBCFG_NORMALIZATION = "tokenizer_normalization"
 DBCFG_MAXWORDFREQ = "tokenizer_maxwordfreq"
 DBCFG_TRANSLITERATION = "tokenizer_transliteration"
 DBCFG_ABBREVIATIONS = "tokenizer_abbreviations"
 # Root logger, used for the messages of this module.
 LOG = logging.getLogger()
 def create(dsn, data_dir):
    """ Factory function of this module: build the tokenizer object,
        handing on `dsn` and `data_dir` unchanged.
    """
    tokenizer = LegacyICUTokenizer(dsn, data_dir)
    return tokenizer
 class LegacyICUTokenizer:
    """ This tokenizer uses libICU to convert names and queries to ASCII.
        Otherwise it uses the same algorithms and data structures as the
        normalization routines in Nominatim 3.
    """
    def __init__(self, dsn, data_dir):
        # Connection string handed to connect() whenever the database is needed.
        self.dsn = dsn
        # Directory into which the tokenizer.php script is written.
        self.data_dir = data_dir
        # The settings below stay None until init_new_db() or
        # init_from_project() has filled them in.
        self.normalization = None
        self.transliteration = None
        self.abbreviations = None
    def init_new_db(self, config, init_db=True):
        """ Set up a new tokenizer for the database.
            This copies all necessary data in the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        if config.TOKENIZER_CONFIG:
            cfgfile = Path(config.TOKENIZER_CONFIG)
        else:
            cfgfile = config.config_dir / 'legacy_icu_tokenizer.json'
        rules = json.loads(cfgfile.read_text())
        self.transliteration = ';'.join(rules['normalization']) + ';'
        self.abbreviations = rules["abbreviations"]
        self.normalization = config.TERM_NORMALIZATION
        self._install_php(config)
        self._save_config(config)
        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)
    def init_from_project(self):
        """ Initialise the tokenizer from the project directory.
            Normalization, transliteration and abbreviations are read from
            the database properties. The abbreviations are stored as JSON
            and decoded here. A property that is not available leaves the
            corresponding attribute at None.
        """
        with connect(self.dsn) as conn:
            self.normalization = get_property(conn, DBCFG_NORMALIZATION)
            self.transliteration = get_property(conn, DBCFG_TRANSLITERATION)
            abbreviations = get_property(conn, DBCFG_ABBREVIATIONS)
        # json.loads() raises a TypeError on None. Presumably get_property()
        # returns None for an unset property (check_database() tests for
        # it), so keep None here to let check_database() report the problem.
        if abbreviations is None:
            self.abbreviations = None
        else:
            self.abbreviations = json.loads(abbreviations)
    def finalize_import(self, config):
        """ Postprocessing step after the import: runs the SQL script
            'tokenizer/legacy_tokenizer_indices.sql' on the database to
            make the tokenizer data ready for use.
        """
        with connect(self.dsn) as conn:
            SQLPreprocessor(conn, config).run_sql_file(
                conn, 'tokenizer/legacy_tokenizer_indices.sql')
    def update_sql_functions(self, config):
        """ (Re)install the SQL functions of this tokenizer. The maximum
            word frequency saved in the database properties is handed to
            the SQL template as `max_word_freq`.
        """
        with connect(self.dsn) as conn:
            frequency_limit = get_property(conn, DBCFG_MAXWORDFREQ)
            SQLPreprocessor(conn, config).run_sql_file(
                conn, 'tokenizer/legacy_icu_tokenizer.sql',
                max_word_freq=frequency_limit)
    def check_database(self):
        """ Check that the tokenizer is set up correctly.
        """
        self.init_from_project()
        if self.normalization is None\
           or self.transliteration is None\
           or self.abbreviations is None:
            return "Configuration for tokenizer 'legacy_icu' are missing."
        return None
    def name_analyzer(self):
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:
            ```
            with tokenizer.name_analyzer() as analyzer:
                analyzer.tokenize()
            ```
            When used outside the with construct, the caller must ensure to
            call the close() function before destructing the analyzer.
            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        # Both ICU transliterators are built from the configured rule strings.
        return LegacyICUNameAnalyzer(
            self.dsn,
            Transliterator.createFromRules("normalizer", self.normalization),
            Transliterator.createFromRules("normalizer", self.transliteration),
            self.abbreviations)
    def _install_php(self, config):
        """ Install the php script for the tokenizer.
        """
        php_file = self.data_dir / "tokenizer.php"
        php_file.write_text(dedent("""\
            <?php
            @define('CONST_Max_Word_Frequency', {1.MAX_WORD_FREQUENCY});
            @define('CONST_Term_Normalization_Rules', "{0.normalization}");
            @define('CONST_Transliteration'. "{0.transliteration}");
            # XXX abreviations
            require_once('{1.lib_dir.php}/tokenizer/legacy_icu_tokenizer.php');
            """.format(self, config)))
    def _save_config(self, config):
        """ Write the settings that must not change for the lifetime of
            the database into the database properties: normalization,
            maximum word frequency, transliteration and the abbreviations
            (serialised as JSON).
        """
        with connect(self.dsn) as conn:
            properties = (
                (DBCFG_NORMALIZATION, self.normalization),
                (DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY),
                (DBCFG_TRANSLITERATION, self.transliteration),
                (DBCFG_ABBREVIATIONS, json.dumps(self.abbreviations)),
            )
            for name, value in properties:
                set_property(conn, name, value)
    def _init_db_tables(self, config):
        """ Create the word table and fill it with the partial words found
            in the names of the place table together with their frequencies.
        """
        with connect(self.dsn) as conn:
            SQLPreprocessor(conn, config).run_sql_file(
                conn, 'tokenizer/legacy_tokenizer_tables.sql')
            conn.commit()
            LOG.warning("Precomputing word tokens")
            # count how often each partial word appears in the names
            frequencies = Counter()
            with self.name_analyzer() as analyzer:
                with conn.cursor(name="words") as name_cur:
                    name_cur.execute("SELECT svals(name) as v, count(*) FROM place GROUP BY v")
                    for name, occurrences in name_cur:
                        standardised = analyzer.make_standard_word(name)
                        if not standardised:
                            continue
                        for part in standardised.split():
                            frequencies[part] += occurrences
            # hand the counts to the word table, one tab-separated row per word
            rows = ['{}\t{}\n'.format(part, total) for part, total in frequencies.items()]
            copy_buffer = io.StringIO(''.join(rows))
            with conn.cursor() as cur:
                cur.copy_from(copy_buffer, 'word', columns=['word_token', 'search_name_count'])
                cur.execute("""UPDATE word SET word_id = nextval('seq_word')
                               WHERE word_id is null""")
            conn.commit()
 class LegacyICUNameAnalyzer:
    """ The legacy analyzer uses the ICU library for splitting names.
        Each instance opens a connection to the database to request the
        normalization.
    """
    def __init__(self, dsn, normalizer, transliterator, abbreviations):
        # The analyzer keeps its own connection. It runs in autocommit mode,
        # so statements are never left in an open transaction.
        self.conn = connect(dsn).connection
        self.conn.autocommit = True
        # Object whose transliterate() is used by normalize().
        self.normalizer = normalizer
        # Object whose transliterate() is used for names, housenumbers and postcodes.
        self.transliterator = transliterator
        # Pairs of (full form, abbreviation) applied in make_standard_word().
        self.abbreviations = abbreviations
        #psycopg2.extras.register_hstore(self.conn)
        # Per-analyzer cache of tokens that were already looked up.
        self._cache = _TokenCache()
    def __enter__(self):
        # Using the analyzer in a with statement yields the analyzer itself.
        return self
    def __exit__(self, exc_type, exc_value, traceback):
        # Release the connection when the with block is left.
        self.close()
    def close(self):
        """ Free all resources used by the analyzer.
        """
        if self.conn:
            self.conn.close()
            self.conn = None
    def normalize(self, phrase):
        """ Normalize the given phrase, i.e. remove all properties that
            are irrelevant for search.
        """
        return self.normalizer.transliterate(phrase)
    def make_standard_word(self, name):
        """ Create the normalised version of the name.
        """
        norm = ' ' + self.transliterator.transliterate(name) + ' '
        for full, abbr in self.abbreviations:
            if full in norm:
                norm = norm.replace(full, abbr)
        return norm.strip()
    def _make_standard_hnr(self, hnr):
        """ Create a normalised version of a housenumber.
            This function takes minor shortcuts on transliteration.
        """
        if hnr.isdigit():
            return hnr
        return self.transliterator.transliterate(hnr)
    def add_postcodes_from_db(self):
        """ Add postcodes from the location_postcode table to the word table.
        """
        copystr = io.StringIO()
        with self.conn.cursor() as cur:
            cur.execute("SELECT distinct(postcode) FROM location_postcode")
            for (postcode, ) in cur:
                copystr.write(postcode)
                copystr.write('\t ')
                copystr.write(self.transliterator.transliterate(postcode))
                copystr.write('\tplace\tpostcode\t0\n')
            cur.copy_from(copystr, 'word',
                          columns=['word', 'word_token', 'class', 'type',
                                   'search_name_count'])
            # Don't really need an ID for postcodes....
            # cur.execute("""UPDATE word SET word_id = nextval('seq_word')
            #                WHERE word_id is null and type = 'postcode'""")
    def update_special_phrases(self, phrases):
        """ Replace the search index for special phrases with the new phrases.
        """
        norm_phrases = set(((self.normalize(p[0]), p[1], p[2], p[3])
                            for p in phrases))
        with self.conn.cursor() as cur:
            # Get the old phrases.
            existing_phrases = set()
            cur.execute("""SELECT word, class, type, operator FROM word
                           WHERE class != 'place'
                                 OR (type != 'house' AND type != 'postcode')""")
            for label, cls, typ, oper in cur:
                existing_phrases.add((label, cls, typ, oper or '-'))
            to_add = norm_phrases - existing_phrases
            to_delete = existing_phrases - norm_phrases
            if to_add:
                copystr = io.StringIO()
                for word, cls, typ, oper in to_add:
                    term = self.make_standard_word(word)
                    if term:
                        copystr.write(word)
                        copystr.write('\t ')
                        copystr.write(term)
                        copystr.write('\t')
                        copystr.write(cls)
                        copystr.write('\t')
                        copystr.write(typ)
                        copystr.write('\t')
                        copystr.write(oper if oper in ('in', 'near')  else '\\N')
                        copystr.write('\t0\n')
                cur.copy_from(copystr, 'word',
                              columns=['word', 'word_token', 'class', 'type',
                                       'operator', 'search_name_count'])
            if to_delete:
                psycopg2.extras.execute_values(
                    cur,
                    """ DELETE FROM word USING (VALUES %s) as v(name, in_class, in_type, op)
                        WHERE word = name and class = in_class and type = in_type
                              and ((op = '-' and operator is null) or op = operator)""",
                    to_delete)
        LOG.info("Total phrases: %s. Added: %s. Deleted: %s",
                 len(norm_phrases), len(to_add), len(to_delete))
    def add_country_names(self, country_code, names):
        """ Add names for the given country to the search index.
        """
        full_names = set((self.make_standard_word(n) for n in names))
        full_names.discard('')
        self._add_normalised_country_names(country_code, full_names)
    def _add_normalised_country_names(self, country_code, names):
        """ Add names for the given country to the search index.
        """
        with self.conn.cursor() as cur:
            # Get existing names
            cur.execute("SELECT word_token FROM word WHERE country_code = %s",
                        (country_code, ))
            new_names = names.difference((t[0] for t in cur))
            if new_names:
                cur.execute("""INSERT INTO word (word_id, word_token, country_code,
                                                 search_name_count)
                               (SELECT nextval('seq_word'), token, '{}', 0
                                FROM unnest(%s) as token)
                            """.format(country_code), (list(new_names),))
    def process_place(self, place):
        """ Compute the tokenizer information for the given place.
            `place` is a dictionary-like object; its entries 'name',
            'country_feature' and 'address' are used. The result is a
            JSON-serialisable structure that will be handed into the
            database via the token_info field.
        """
        info = _TokenInfo(self._cache)
        name_tags = place.get('name')
        if name_tags:
            standard_names = {self.make_standard_word(n) for n in name_tags.values()}
            standard_names.discard('')
            info.add_names(self.conn, standard_names)
            # places marked as country feature with a two-letter code also
            # provide the names for that country
            country = place.get('country_feature')
            if country and re.fullmatch(r'[A-Za-z][A-Za-z]', country):
                self._add_normalised_country_names(country.lower(), standard_names)
        address_tags = place.get('address')
        if address_tags:
            housenumbers = []
            other_terms = []
            for tag, content in address_tags.items():
                if tag == 'postcode':
                    self._add_postcode(content)
                elif tag in ('housenumber', 'streetnumber', 'conscriptionnumber'):
                    housenumbers.append(content)
                elif tag == 'street':
                    info.add_street(self.conn, self.make_standard_word(content))
                elif tag == 'place':
                    info.add_place(self.conn, self.make_standard_word(content))
                elif not (tag.startswith('_') or tag in ('country', 'full')):
                    other_terms.append((tag, self.make_standard_word(content)))
            if housenumbers:
                single_numbers = self._split_housenumbers(housenumbers)
                info.add_housenumbers(
                    self.conn, [self._make_standard_hnr(n) for n in single_numbers])
            if other_terms:
                info.add_address_terms(self.conn, other_terms)
        return info.data
    def _add_postcode(self, postcode):
        """ Make sure the normalized postcode is present in the word table.
        """
        if re.search(r'[:,;]', postcode) is None and not postcode in self._cache.postcodes:
            term = self.make_standard_word(postcode)
            if not term:
                return
            with self.conn.cursor() as cur:
                # no word_id needed for postcodes
                cur.execute("""INSERT INTO word (word, word_token, class, type,
                                                 search_name_count)
                               (SELECT pc, %s, 'place', 'postcode', 0
                                FROM (VALUES (%s)) as v(pc)
                                WHERE NOT EXISTS
                                 (SELECT * FROM word
                                  WHERE word = pc and class='place' and type='postcode'))
                            """, (' ' + term, postcode))
            self._cache.postcodes.add(postcode)
    @staticmethod
    def _split_housenumbers(hnrs):
        if len(hnrs) > 1 or ',' in hnrs[0] or ';' in hnrs[0]:
            # split numbers if necessary
            simple_list = []
            for hnr in hnrs:
                simple_list.extend((x.strip() for x in re.split(r'[;,]', hnr)))
            if len(simple_list) > 1:
                hnrs = list(set(simple_list))
            else:
                hnrs = simple_list
        return hnrs
 class _TokenInfo:
    """ Collect token information to be sent back to the database.
    """
    def __init__(self, cache):
        # Token cache used for looking up the ids of terms and housenumbers.
        self.cache = cache
        # Collected token information. The add_*() functions fill in the
        # keys 'names', 'hnr_tokens', 'hnr', 'street', 'place_search',
        # 'place_match' and 'addr' as needed.
        self.data = {}
    @staticmethod
    def _mk_array(tokens):
        return '{%s}' % ','.join((str(s) for s in tokens))
    def add_names(self, conn, names):
        """ Adds token information for the normalised names.
        """
        # Start with all partial names
        terms = set((part for ns in names for part in ns.split()))
        # Add partials for the full terms (TO BE REMOVED)
        terms.update((n for n in names))
        # Add the full names
        terms.update((' ' + n for n in names))
        self.data['names'] = self._mk_array(self.cache.get_term_tokens(conn, terms))
    def add_housenumbers(self, conn, hnrs):
        """ Extract housenumber information from a list of normalised
            housenumbers.
        """
        self.data['hnr_tokens'] = self._mk_array(self.cache.get_hnr_tokens(conn, hnrs))
        self.data['hnr'] = ';'.join(hnrs)
    def add_street(self, conn, street):
        """ Add addr:street match terms.
        """
        if not street:
            return
        term = ' ' + street
        tid = self.cache.names.get(term)
        if tid is None:
            with conn.cursor() as cur:
                cur.execute("""SELECT word_id FROM word
                                WHERE word_token = %s
                                      and class is null and type is null""",
                            (term, ))
                if cur.rowcount > 0:
                    tid = cur.fetchone()[0]
                    self.cache.names[term] = tid
        if tid is not None:
            self.data['street'] = '{%d}' % tid
    def add_place(self, conn, place):
        """ Add addr:place search and match terms.
        """
        if not place:
            return
        partial_ids = self.cache.get_term_tokens(conn, place.split())
        tid = self.cache.get_term_tokens(conn, [' ' + place])
        self.data['place_search'] = self._mk_array(itertools.chain(partial_ids, tid))
        self.data['place_match'] = '{%s}' % tid[0]
    def add_address_terms(self, conn, terms):
        """ Add additional address terms.
        """
        tokens = {}
        for key, value in terms:
            if not value:
                continue
            partial_ids = self.cache.get_term_tokens(conn, value.split())
            term = ' ' + value
            tid = self.cache.names.get(term)
            if tid is None:
                with conn.cursor() as cur:
                    cur.execute("""SELECT word_id FROM word
                                    WHERE word_token = %s
                                          and class is null and type is null""",
                                (term, ))
                    if cur.rowcount > 0:
                        tid = cur.fetchone()[0]
                        self.cache.names[term] = tid
            tokens[key] = [self._mk_array(partial_ids),
                           '{%s}' % ('' if tid is None else str(tid))]
        if tokens:
            self.data['addr'] = tokens
 class _TokenCache:
    """ Cache for token information to avoid repeated database queries.
        This cache is not thread-safe and needs to be instantiated per
        analyzer.
    """
    def __init__(self):
        # Maps a term to its token id. The id 0 is cached as well but
        # never handed out by get_term_tokens().
        self.names = {}
        # Postcodes that have already been handled.
        self.postcodes = set()
        # Maps a housenumber to its token id.
        self.housenumbers = {}
    def get_term_tokens(self, conn, terms):
        """ Get token ids for a list of terms, looking them up in the database
            if necessary.
        """
        tokens = []
        askdb = []
        for term in terms:
            token = self.names.get(term)
            if token is None:
                askdb.append(term)
            elif token != 0:
                tokens.append(token)
        if askdb:
            with conn.cursor() as cur:
                cur.execute("SELECT term, getorcreate_term_id(term) FROM unnest(%s) as term",
                            (askdb, ))
                for term, tid in cur:
                    self.names[term] = tid
                    if tid != 0:
                        tokens.append(tid)
        return tokens
    def get_hnr_tokens(self, conn, terms):
        """ Get token ids for a list of housenumbers, looking them up in the
            database if necessary.
        """
        tokens = []
        askdb = []
        for term in terms:
            token = self.housenumbers.get(term)
            if token is None:
                askdb.append(term)
            else:
                tokens.append(token)
        if askdb:
            with conn.cursor() as cur:
                cur.execute("SELECT nr, getorcreate_hnr_id(nr) FROM unnest(%s) as nr",
                            (askdb, ))
                for term, tid in cur:
                    self.housenumbers[term] = tid
                    tokens.append(tid)
        return tokens
--- a/settings/env.defaults
+++ b/settings/env.defaults
@@ -46,6 +46,12 @@ NOMINATIM_LANGUAGES=
 # Changing this value requires a reimport.
 NOMINATIM_TERM_NORMALIZATION=":: NFD (); [[:Nonspacing Mark:] [:Cf:]] >;  :: lower (); [[:Punctuation:][:Space:]]+ > ' '; :: NFC ();"
 # Configuration file for the tokenizer.
 # The content depends on the tokenizer used. If left empty the default settings
 # for the chosen tokenizer will be used. The configuration can only be set
 # on import and not be changed afterwards.
 NOMINATIM_TOKENIZER_CONFIG=
 # Search in the Tiger house number data for the US.
 # Note: The tables must already exist or queries will throw errors.
 # Changing this value requires to run ./utils/setup --create-functions --setup-website.
--- a/settings/legacy_icu_tokenizer.json
+++ b/settings/legacy_icu_tokenizer.json
@@ -0,0 +1,889 @@
 { "normalization": [ ":: Latin ()",
                     ":: Ascii ()",
                     ":: NFD ()",
                     "'' >",
                     "[[:Nonspacing Mark:] [:Cf:]] >",
                     "[^[:Ascii:]] >",
                     ":: lower ()",
                     "[[:Punctuation:][:Space:]]+ > ' '",
                     ":: NFC ()"
                   ],
  "abbreviations": [
    [" national wildlife refuge area ", " nwra "],
    [" national recreation area ", " nra "],
    [" air national guard base ", " angb "],
    [" zhilishchien komplieks ", " zh k "],
    [" trung tam thuong mdhi ", " tttm "],
    [" poligono industrial ", " pgind "],
    [" trung hoc pho thong ", " thpt "],
    [" onze lieve vrouw e ", " olv "],
    [" strada provinciale ", " sp "],
    ["onze lieve vrouw e ", " olv "],
    [" punto kilometrico ", " pk "],
    [" cong vien van hoa ", " cvvh "],
    [" can cu khong quan ", " cckq "],
    ["strada provinciale ", " sp "],
    [" strada regionale ", " sr "],
    [" strada comunale ", " sc "],
    ["strada regionale ", " sr "],
    [" trung hoc co so ", " thcs "],
    [" san bay quoc te ", " sbqt "],
    [" cong ty co phyn ", " ctcp "],
    [" khu cong nghiep ", " kcn "],
    [" air force base ", " afb "],
    [" strada statale ", " ss "],
    [" vien bcyo tang ", " vbt "],
    ["strada comunale ", " sc "],
    [" circunvalacion ", " ccvcn "],
    [" paseo maritimo ", " psmar "],
    [" wielkopolskie ", " wlkp "],
    [" national park ", " np "],
    [" middle school ", " ms "],
    [" international ", " intl "],
    [" burgermeister ", " bgm "],
    [" vuon quoc gia ", " vqg "],
    [" qucyng truong ", " qt "],
    ["strada statale ", " ss "],
    [" state highway ", " sh "],
    ["burgermeister ", " bgm "],
    [" right of way ", " rowy "],
    [" hauptbahnhof ", " hbf "],
    [" apartamentos ", " aptos "],
    [" wielkopolski ", " wlkp "],
    [" burgemeester ", " bg "],
    [" camino nuevo ", " c n "],
    [" camino hondo ", " c h "],
    [" urbanizacion ", " urb "],
    [" camino viejo ", " c v "],
    [" wielkopolska ", " wlkp "],
    [" wojewodztwie ", " woj "],
    [" county route ", " cr "],
    [" prolongacion ", " prol "],
    [" thoroughfare ", " thor "],
    [" san van dong ", " svd "],
    [" tong cong ty ", " tct "],
    [" khu nghi mat ", " knm "],
    [" nha thi dzu ", " ntd "],
    [" khu du lich ", " kdl "],
    [" demarcacion ", " demar "],
    [" cau ldhc bo ", " clb "],
    [" interchange ", " intg "],
    [" distributor ", " dstr "],
    [" state route ", " sr "],
    [" wojewodztwo ", " woj "],
    [" reservation ", " res "],
    [" monseigneur ", " mgr "],
    [" transversal ", " trval "],
    [" extrarradio ", " extrr "],
    [" high school ", " hs "],
    [" mazowieckie ", " maz "],
    [" residencial ", " resid "],
    [" cong truong ", " ct "],
    [" cooperativa ", " coop "],
    [" diseminado ", " disem "],
    [" barranquil ", " bqllo "],
    [" fire track ", " ftrk "],
    [" south east ", " se "],
    [" north east ", " ne "],
    [" university ", " univ "],
    [" south west ", " sw "],
    [" monasterio ", " mtrio "],
    [" vecindario ", " vecin "],
    [" carreterin ", " ctrin "],
    [" callejuela ", " cjla "],
    [" north-east ", " ne "],
    [" south-west ", " sw "],
    [" gebroeders ", " gebr "],
    [" serviceway ", " swy "],
    [" quadrangle ", " qdgl "],
    [" commandant ", " cmdt "],
    [" extramuros ", " extrm "],
    [" escalinata ", " escal "],
    [" north-west ", " n "],
    [" bulevardul ", " bd "],
    [" particular ", " parti "],
    [" mazowiecka ", " maz "],
    [" mazowiecki ", " maz "],
    [" north west ", " n "],
    [" industrial ", " ind "],
    [" costanilla ", " cstan "],
    [" khach sdhn ", " ks "],
    [" south-east ", " se "],
    [" phi truong ", " pt "],
    [" expressway ", " exp "],
    [" fondamenta ", " f ta "],
    [" apartments ", " apts "],
    [" cul de sac ", " cds "],
    [" corralillo ", " crrlo "],
    [" mitropolit ", " mit "],
    [" etorbidea ", " etorb "],
    [" ploshchad ", " pl "],
    [" cobertizo ", " cbtiz "],
    [" underpass ", " upas "],
    [" crossroad ", " crd "],
    [" fundatura ", " fnd "],
    [" foreshore ", " fshr "],
    [" parklands ", " pkld "],
    [" esplanade ", " esp "],
    [" centreway ", " cnwy "],
    [" formation ", " form "],
    [" explanada ", " expla "],
    [" viviendas ", " vvdas "],
    [" northeast ", " ne "],
    [" cong vien ", " cv "],
    [" northwest ", " n "],
    [" buildings ", " bldgs "],
    [" errepidea ", " err "],
    [" extension ", " ex "],
    [" municipal ", " mun "],
    [" southeast ", " se "],
    [" sanatorio ", " sanat "],
    [" thanh pho ", " tp "],
    [" firetrail ", " fit "],
    [" santuario ", " santu "],
    [" southwest ", " sw "],
    [" autopista ", " auto "],
    [" president ", " pres "],
    [" rinconada ", " rcda "],
    [" kardinaal ", " kard "],
    [" plazoleta ", " pzta "],
    [" duong sat ", " ds "],
    [" trung tam ", " tt "],
    [" piazzetta ", " pta "],
    [" boardwalk ", " bwlk "],
    [" bulievard ", " bd "],
    [" luitenant ", " luit "],
    [" courtyard ", " ctyd "],
    [" reservoir ", " res "],
    [" bulevardu ", " bd "],
    [" community ", " comm "],
    [" concourse ", " con "],
    [" profiesor ", " prof "],
    [" promenade ", " prom "],
    [" gienieral ", " ghien "],
    [" puistikko ", " pko "],
    [" balneario ", " balnr "],
    [" carretera ", " ctra "],
    [" ingenieur ", " ir "],
    [" boulevard ", " bd "],
    [" deviation ", " devn "],
    [" hipodromo ", " hipod "],
    [" professor ", " prof "],
    [" triangle ", " tri "],
    [" dotsient ", " dots "],
    [" boundary ", " bdy "],
    [" salizada ", " s da "],
    [" trunkway ", " tkwy "],
    [" cinturon ", " cint "],
    ["president ", " pres "],
    [" military ", " mil "],
    [" jonkheer ", " jhr "],
    [" motorway ", " mwy "],
    [" steenweg ", " stwg "],
    [" crescent ", " cr "],
    [" kanunnik ", " kan "],
    [" koningin ", " kon "],
    [" crossing ", " xing "],
    [" callejon ", " cjon "],
    [" pasadizo ", " pzo "],
    [" crossway ", " cowy "],
    [" cottages ", " cotts "],
    [" mountain ", " mtn "],
    [" business ", " bus "],
    [" pierwszy ", " 1 "],
    [" pierwsza ", " 1 "],
    [" pierwsze ", " 1 "],
    [" barriada ", " barda "],
    [" entrance ", " ent "],
    [" causeway ", " cway "],
    [" generaal ", " gen "],
    [" driveway ", " dvwy "],
    [" township ", " twp "],
    [" stazione ", " staz "],
    [" broadway ", " bway "],
    [" alleyway ", " alwy "],
    [" quadrant ", " qdrt "],
    [" apeadero ", " apdro "],
    [" arboleda ", " arb "],
    [" escalera ", " esca "],
    [" rdhp hat ", " rh "],
    [" transito ", " trans "],
    [" ddhi hoc ", " dh "],
    [" travesia ", " trva "],
    [" barranco ", " branc "],
    [" namestie ", " nam "],
    [" viaducto ", " vcto "],
    [" convento ", " cnvto "],
    [" estacion ", " estcn "],
    ["puistikko ", " pko "],
    [" precinct ", " pct "],
    [" heiligen ", " hl "],
    [" edificio ", " edifc "],
    [" prazuela ", " przla "],
    [" thi trzn ", " tt "],
    [" ridgeway ", " rgwy "],
    [" riverway ", " rvwy "],
    [" corredor ", " crrdo "],
    [" passatge ", " ptge "],
    [" junction ", " jnc "],
    [" hospital ", " hosp "],
    [" highroad ", " hrd "],
    [" torrente ", " trrnt "],
    [" avinguda ", " av "],
    [" portillo ", " ptilo "],
    [" diagonal ", " diag "],
    [" buu dien ", " bd "],
    [" alqueria ", " alque "],
    [" poligono ", " polig "],
    [" roadside ", " rdsd "],
    [" glorieta ", " gta "],
    [" fundacul ", " fdc "],
    [" cao dang ", " cd "],
    [" rosebowl ", " rsbl "],
    [" complejo ", " compj "],
    [" carretil ", " crtil "],
    [" intrarea ", " int "],
    [" gran via ", " g v "],
    [" approach ", " app "],
    [" stradela ", " sdla "],
    [" conjunto ", " cjto "],
    [" arterial ", " artl "],
    [" plazuela ", " plzla "],
    [" frontage ", " frtg "],
    [" faubourg ", " fg "],
    [" mansions ", " mans "],
    [" turnpike ", " tpk "],
    [" piazzale ", " p le "],
    [" tieu hoc ", " th "],
    [" bulevard ", " bd "],
    [" sendera ", " sedra "],
    [" cutting ", " cutt "],
    [" cantina ", " canti "],
    [" cantera ", " cantr "],
    [" rotonda ", " rtda "],
    [" pasillo ", " psllo "],
    [" landing ", " ldg "],
    [" kolonel ", " kol "],
    [" cong ty ", " cty "],
    [" fairway ", " fawy "],
    [" highway ", " hwy "],
    [" lookout ", " lkt "],
    [" meander ", " mr "],
    [" carrera ", " cra "],
    [" station ", " stn "],
    [" kapitan ", " kap "],
    [" medical ", " med "],
    [" broeder ", " br "],
    [" poblado ", " pbdo "],
    [" impasse ", " imp "],
    [" gardens ", " gdn "],
    [" nha tho ", " nt "],
    [" nha hat ", " nh "],
    [" freeway ", " fwy "],
    [" trasera ", " tras "],
    [" portico ", " prtco "],
    [" terrace ", " ter "],
    [" heights ", " hts "],
    [" camping ", " campg "],
    [" callizo ", " cllzo "],
    [" footway ", " ftwy "],
    [" calzada ", " czada "],
    [" dominee ", " ds "],
    [" meadows ", " mdws "],
    [" sendero ", " send "],
    [" osiedle ", " os "],
    [" estrada ", " estda "],
    [" avenida ", " av "],
    [" zgornji ", " zg "],
    [" zgornje ", " zg "],
    [" zgornja ", " zg "],
    [" arrabal ", " arral "],
    [" espalda ", " eslda "],
    [" entrada ", " entd "],
    [" kleiner ", " kl "],
    [" kleines ", " kl "],
    [" viaduct ", " via "],
    [" roadway ", " rdwy "],
    [" strasse ", " st "],
    [" spodnje ", " sp "],
    [" spodnji ", " sp "],
    [" spodnja ", " sp "],
    [" fabrica ", " fca "],
    [" muntele ", " mt "],
    [" maantee ", " mt "],
    [" srednje ", " sr "],
    [" unterer ", " u "],
    [" unteres ", " u "],
    [" plateau ", " plat "],
    [" srednji ", " sr "],
    [" empresa ", " empr "],
    [" angosta ", " angta "],
    [" costera ", " coste "],
    [" tinh lo ", " tl "],
    [" quoc lo ", " ql "],
    [" auf der ", " a d "],
    [" bulvari ", " bl "],
    [" ddhi lo ", " dl "],
    [" namesti ", " nam "],
    [" passeig ", " pg "],
    [" carrero ", " cro "],
    [" cortijo ", " crtjo "],
    [" san bay ", " sb "],
    [" riviera ", " rvra "],
    [" caddesi ", " cd "],
    [" andador ", " andad "],
    [" walkway ", " wkwy "],
    [" granden ", " gr "],
    [" grosser ", " gr "],
    [" grosses ", " gr "],
    [" reserve ", " res "],
    [" alameda ", " alam "],
    [" retreat ", " rtt "],
    [" acequia ", " aceq "],
    [" platsen ", " pl "],
    [" bahnhof ", " bf "],
    [" autovia ", " autov "],
    [" srednja ", " sr "],
    [" galeria ", " gale "],
    [" circuit ", " cct "],
    [" svingen ", " sv "],
    [" plassen ", " pl "],
    [" mirador ", " mrdor "],
    [" laneway ", " lnwy "],
    [" kolonia ", " kol "],
    [" outlook ", " otlk "],
    [" caravan ", " cvn "],
    [" osiedlu ", " os "],
    [" palacio ", " palac "],
    [" pantano ", " pant "],
    [" partida ", " ptda "],
    [" calleja ", " cllja "],
    [" mevrouw ", " mevr "],
    [" meester ", " mr "],
    [" pastoor ", " past "],
    [" prinses ", " pr "],
    [" bulevar ", " bd "],
    [" tollway ", " tlwy "],
    ["steenweg ", " stwg "],
    [" caserio ", " csrio "],
    [" mercado ", " merc "],
    [" alejach ", " al "],
    [" kvartal ", " kv "],
    [" parkway ", " pwy "],
    [" passage ", " ps "],
    [" pathway ", " pway "],
    [" splaiul ", " sp "],
    [" soseaua ", " sos "],
    [" colonia ", " col "],
    [" wielkie ", " wlk "],
    [" trzecie ", " 3 "],
    [" llanura ", " llnra "],
    [" malecon ", " malec "],
    [" trzecia ", " 3 "],
    [" trailer ", " trlr "],
    [" cuadra ", " cuadr "],
    [" cty cp ", " ctcp "],
    [" paraje ", " praje "],
    [" parque ", " pque "],
    [" piazza ", " p za "],
    [" puerta ", " pta "],
    [" little ", " lt "],
    [" pueblo ", " pblo "],
    [" puente ", " pnte "],
    [" jardin ", " jdin "],
    [" granja ", " granj "],
    [" market ", " mkt "],
    [" pasaje ", " psaje "],
    [" rotary ", " rty "],
    [" corral ", " crral "],
    [" siding ", " sdng "],
    [" nucleo ", " ncleo "],
    [" muelle ", " muell "],
    [" carril ", " crril "],
    [" portal ", " prtal "],
    [" ramble ", " rmbl "],
    [" pocket ", " pkt "],
    [" chalet ", " chlet "],
    [" canton ", " cant "],
    [" ladera ", " ldera "],
    [" parade ", " pde "],
    [" dehesa ", " dhsa "],
    [" museum ", " mus "],
    [" middle ", " mid "],
    [" cuesta ", " custa "],
    [" gracht ", " gr "],
    [" virful ", " vf "],
    [" m tele ", " mt "],
    [" varful ", " vf "],
    [" str la ", " sdla "],
    [" arcade ", " arc "],
    [" strada ", " st "],
    [" access ", " accs "],
    [" bajada ", " bjada "],
    [" veliki ", " v "],
    ["strasse ", " st "],
    [" velike ", " v "],
    [" untere ", " u "],
    [" velika ", " v "],
    [" artery ", " arty "],
    [" avenue ", " av "],
    [" miasto ", " m "],
    [" bypass ", " byp "],
    [" placem ", " pl "],
    [" barrio ", " bo "],
    [" center ", " ctr "],
    [" bldngs ", " bldgs "],
    [" puerto ", " pto "],
    [" wielka ", " wlk "],
    [" tunnel ", " tun "],
    [" wielki ", " wlk "],
    [" bridge ", " bri "],
    [" trzeci ", " 3 "],
    [" veliko ", " v "],
    [" quelle ", " qu "],
    [" acceso ", " acces "],
    [" bulvar ", " bl "],
    [" sokagi ", " sk "],
    ["platsen ", " pl "],
    [" stigen ", " st "],
    [" brucke ", " br "],
    [" an der ", " a d "],
    [" thi xa ", " tx "],
    [" nordre ", " ndr "],
    [" rambla ", " rbla "],
    [" sondre ", " sdr "],
    ["quoc lo ", " ql "],
    [" phuong ", " p "],
    [" vastra ", " v "],
    [" carrer ", " c "],
    [" oberes ", " o "],
    [" raitti ", " r "],
    [" puisto ", " ps "],
    [" arroyo ", " arry "],
    [" penger ", " pgr "],
    [" oberer ", " o "],
    [" kleine ", " kl "],
    [" grosse ", " gr "],
    ["granden ", " gr "],
    [" villas ", " vlls "],
    [" taival ", " tvl "],
    [" in der ", " i d "],
    [" centre ", " ctr "],
    [" drugie ", " 2 "],
    [" dokter ", " dr "],
    [" grange ", " gra "],
    [" doctor ", " dr "],
    [" vicolo ", " v lo "],
    [" kort e ", " k "],
    [" koning ", " kon "],
    [" straat ", " st "],
    [" svieti ", " sv "],
    [" callej ", " cjon "],
    [" ground ", " grnd "],
    [" vereda ", " vreda "],
    [" chemin ", " ch "],
    [" street ", " st "],
    [" strand ", " st "],
    [" sainte ", " ste "],
    [" camino ", " cno "],
    [" garden ", " gdn "],
    [" follow ", " folw "],
    [" estate ", " est "],
    [" doktor ", " d r "],
    [" subway ", " sbwy "],
    [" ulitsa ", " ul "],
    [" square ", " sq "],
    [" towers ", " twrs "],
    ["plassen ", " pl "],
    [" county ", " co "],
    [" brazal ", " brzal "],
    [" circus ", " crcs "],
    ["svingen ", " sv "],
    [" rampla ", " rampa "],
    [" bloque ", " blque "],
    [" circle ", " cir "],
    [" island ", " is "],
    [" common ", " comm "],
    [" ribera ", " rbra "],
    [" sector ", " sect "],
    [" rincon ", " rcon "],
    [" van de ", " vd "],
    [" corner ", " cnr "],
    [" subida ", " sbida "],
    [" banda ", " b "],
    [" bulev ", " bd "],
    [" barro ", " bo "],
    [" cllon ", " cjon "],
    [" p zza ", " p za "],
    [" drugi ", " 2 "],
    [" druga ", " 2 "],
    [" placu ", " pl "],
    [" aleji ", " al "],
    [" aleja ", " al "],
    [" aleje ", " al "],
    [" stary ", " st "],
    [" stara ", " st "],
    [" dolny ", " dln "],
    [" dolna ", " dln "],
    [" gorne ", " gn "],
    [" gorna ", " gn "],
    [" stare ", " st "],
    [" gorny ", " gn "],
    [" ulicy ", " ul "],
    [" ulica ", " ul "],
    [" o l v ", " olv "],
    [" plein ", " pln "],
    [" markt ", " mkt "],
    [" lange ", " l "],
    [" viale ", " v le "],
    ["gracht ", " gr "],
    [" prins ", " pr "],
    ["straat ", " st "],
    [" plass ", " pl "],
    [" sving ", " sv "],
    [" gaten ", " g "],
    [" veien ", " v "],
    [" vliet ", " vlt "],
    [" dolne ", " dln "],
    [" b dul ", " bd "],
    [" sodra ", " s "],
    [" norra ", " n "],
    [" gamla ", " gla "],
    [" grand ", " gr "],
    [" vagen ", " v "],
    [" gatan ", " g "],
    [" ostra ", " o "],
    ["vastra ", " v "],
    [" cadde ", " cd "],
    [" duong ", " d "],
    [" sokak ", " sk "],
    [" plats ", " pl "],
    ["stigen ", " st "],
    [" vayla ", " vla "],
    ["taival ", " tvl "],
    [" sveti ", " sv "],
    [" aukio ", " auk "],
    [" sveta ", " sv "],
    [" cesta ", " c "],
    [" piata ", " pta "],
    [" aleea ", " al "],
    [" kaari ", " kri "],
    ["penger ", " pgr "],
    [" ranta ", " rt "],
    [" rinne ", " rn "],
    ["raitti ", " r "],
    ["puisto ", " ps "],
    [" polku ", " p "],
    [" porta ", " pta "],
    [" ponte ", " p te "],
    [" paseo ", " po "],
    [" fbrca ", " fca "],
    [" allee ", " al "],
    [" cours ", " crs "],
    ["sainte ", " ste "],
    ["square ", " sq "],
    [" largo ", " l go "],
    [" wharf ", " whrf "],
    [" corte ", " c te "],
    [" corso ", " c so "],
    [" campo ", " c po "],
    [" santa ", " sta "],
    [" calle ", " c "],
    [" strip ", " strp "],
    [" alley ", " al "],
    [" north ", " n "],
    [" block ", " blk "],
    [" gully ", " gly "],
    [" sielo ", " s "],
    [" brace ", " br "],
    [" ronde ", " rnde "],
    [" grove ", " gr "],
    [" break ", " brk "],
    [" roads ", " rds "],
    [" track ", " trk "],
    [" house ", " ho "],
    [" trail ", " trl "],
    [" mount ", " mt "],
    [" cross ", " crss "],
    [" beach ", " bch "],
    [" point ", " pt "],
    [" basin ", " basn "],
    [" green ", " gn "],
    [" plaza ", " pl "],
    [" lille ", " ll "],
    [" slope ", " slpe "],
    [" placa ", " pl "],
    [" place ", " pl "],
    [" shunt ", " shun "],
    [" saint ", " st "],
    [" ulice ", " ul "],
    [" amble ", " ambl "],
    [" route ", " rt "],
    [" sound ", " snd "],
    [" store ", " st "],
    [" front ", " frnt "],
    [" elbow ", " elb "],
    [" glade ", " gl "],
    [" south ", " s "],
    [" round ", " rnd "],
    [" drive ", " dr "],
    [" croft ", " cft "],
    [" platz ", " pl "],
    [" ferry ", " fy "],
    [" ridge ", " rdge "],
    [" tanav ", " tn "],
    [" banan ", " ba "],
    [" quays ", " qys "],
    [" sankt ", " st "],
    [" vkhod ", " vkh "],
    [" chase ", " ch "],
    [" vista ", " vsta "],
    [" rhein ", " rh "],
    [" court ", " ct "],
    ["brucke ", " br "],
    [" upper ", " up "],
    [" river ", " r "],
    [" range ", " rnge "],
    [" lower ", " lr "],
    [" kalea ", " k "],
    [" crest ", " crst "],
    [" obere ", " o "],
    [" manor ", " mnr "],
    [" byway ", " bywy "],
    [" reach ", " rch "],
    [" copse ", " cps "],
    ["quelle ", " qu "],
    [" creek ", " cr "],
    [" close ", " c "],
    [" fort ", " ft "],
    [" apch ", " app "],
    [" mont ", " mt "],
    [" bdul ", " bd "],
    ["saint ", " st "],
    [" back ", " bk "],
    [" c le ", " c "],
    ["place ", " pl "],
    [" frwy ", " fwy "],
    [" quai ", " qu "],
    [" ally ", " al "],
    [" m te ", " mt "],
    [" lane ", " ln "],
    ["aukio ", " auk "],
    [" loop ", " lp "],
    [" line ", " ln "],
    [" alue ", " al "],
    [" link ", " lk "],
    [" glde ", " gl "],
    [" alea ", " al "],
    [" gate ", " g "],
    [" intr ", " int "],
    [" gdns ", " gdn "],
    [" hird ", " hrd "],
    [" varf ", " vf "],
    [" virf ", " vf "],
    [" hgts ", " hts "],
    [" expy ", " exp "],
    ["markt ", " mkt "],
    [" bypa ", " byp "],
    ["o l v ", " olv "],
    [" cres ", " cr "],
    [" bdwy ", " bway "],
    [" csac ", " cds "],
    [" nowy ", " n "],
    [" laan ", " ln "],
    [" crsg ", " xing "],
    ["vliet ", " vlt "],
    [" city ", " cty "],
    ["sving ", " sv "],
    ["plass ", " pl "],
    ["gaten ", " g "],
    ["veien ", " v "],
    [" gata ", " g "],
    [" sint ", " st "],
    [" caus ", " cway "],
    [" cove ", " cv "],
    ["plein ", " pln "],
    [" cswy ", " cway "],
    [" plac ", " pl "],
    [" nowa ", " n "],
    [" kolo ", " k "],
    [" katu ", " k "],
    [" duze ", " dz "],
    [" blvd ", " bd "],
    [" p ta ", " pta "],
    [" maly ", " ml "],
    [" mala ", " ml "],
    [" bdge ", " bri "],
    [" nowe ", " n "],
    [" brdg ", " bri "],
    [" male ", " ml "],
    [" drwy ", " dvwy "],
    [" duza ", " dz "],
    [" utca ", " u "],
    [" east ", " e "],
    [" duzy ", " dz "],
    ["kaari ", " kri "],
    [" quan ", " q "],
    [" svwy ", " swy "],
    [" shwy ", " sh "],
    [" road ", " rd "],
    ["sankt ", " st "],
    [" quay ", " qy "],
    ["plats ", " pl "],
    [" rise ", " ri "],
    [" berg ", " bg "],
    [" tcty ", " tct "],
    [" viad ", " via "],
    [" view ", " vw "],
    [" vdct ", " via "],
    [" vale ", " v "],
    [" avda ", " av "],
    [" grad ", " ghr "],
    [" walk ", " wlk "],
    [" west ", " w "],
    [" yard ", " yd "],
    [" blok ", " bl "],
    [" terr ", " ter "],
    [" cmno ", " cno "],
    [" stra ", " st "],
    [" thfr ", " thor "],
    [" turn ", " tn "],
    [" tpke ", " tpk "],
    [" burg ", " bg "],
    ["vayla ", " vla "],
    ["vagen ", " v "],
    [" tori ", " tr "],
    ["gatan ", " g "],
    ["grand ", " gr "],
    [" pass ", " ps "],
    [" pkwy ", " pwy "],
    [" park ", " pk "],
    ["rinne ", " rn "],
    [" mtwy ", " mwy "],
    [" mndr ", " mr "],
    [" kyla ", " kl "],
    [" kuja ", " kj "],
    ["platz ", " pl "],
    ["ranta ", " rt "],
    [" mile ", " mi "],
    [" pfad ", " p "],
    [" mews ", " m "],
    ["polku ", " p "],
    [" psge ", " ps "],
    [" plza ", " pl "],
    ["ostra ", " o "],
    ["gamla ", " gla "],
    [" stig ", " st "],
    ["norra ", " n "],
    ["sodra ", " s "],
    [" pike ", " pk "],
    [" dorf ", " df "],
    [" piaz ", " p za "],
    [" phwy ", " pway "],
    ["pfad ", " p "],
    [" mnt ", " mt "],
    ["gata ", " g "],
    [" bhf ", " bf "],
    [" bad ", " b "],
    ["gate ", " g "],
    [" zum ", " z "],
    ["stig ", " st "],
    [" blv ", " bd "],
    ["kuja ", " kj "],
    [" bul ", " bd "],
    [" str ", " st "],
    ["alue ", " al "],
    [" cen ", " ctr "],
    [" ave ", " av "],
    ["kyla ", " kl "],
    [" ale ", " al "],
    [" spl ", " sp "],
    [" all ", " al "],
    [" k s ", " ks "],
    [" aly ", " al "],
    ["dorf ", " df "],
    [" bvd ", " bd "],
    [" vag ", " v "],
    [" iii ", " 3 "],
    [" tie ", " t "],
    [" sok ", " sk "],
    ["burg ", " bg "],
    ["katu ", " k "],
    ["berg ", " bg "],
    ["tori ", " tr "],
    [" kte ", " k "],
    [" gro ", " gr "],
    [" grn ", " gn "],
    [" gld ", " gl "],
    [" san ", " s "],
    [" hse ", " ho "],
    [" gte ", " g "],
    [" rte ", " rt "],
    [" rue ", " r "],
    [" che ", " ch "],
    [" pas ", " ps "],
    [" plz ", " pl "],
    [" pnt ", " pt "],
    [" pky ", " pwy "],
    [" pza ", " pl "],
    [" rvr ", " r "],
    [" riv ", " r "],
    [" lit ", " lt "],
    [" p k ", " pk "],
    [" lwr ", " lr "],
    [" low ", " lr "],
    [" sth ", " s "],
    [" crk ", " cr "],
    ["pres ", " pres "],
    ["laan ", " ln "],
    [" bda ", " b "],
    [" vei ", " v "],
    [" via ", " v "],
    [" way ", " wy "],
    [" upr ", " up "],
    [" avd ", " av "],
    [" crt ", " ct "],
    ["stwg ", " stwg "],
    ["sint ", " st "],
    [" v d ", " vd "],
    [" van ", " v "],
    [" drv ", " dr "],
    [" tce ", " ter "],
    [" va ", " v "],
    [" oa ", " o "],
    [" sa ", " s "],
    [" na ", " n "],
    ["bgm ", " bgm "],
    [" nw ", " n "],
    ["vag ", " v "],
    [" im ", " 1 "],
    ["vla ", " vla "],
    ["gla ", " gla "],
    [" am ", " a "],
    [" ph ", " p "],
    ["rue ", " r "],
    [" ga ", " g "],
    ["ste ", " ste "],
    ["str ", " st "],
    [" cl ", " c "],
    [" vn ", " v "],
    [" gt ", " g "],
    ["vei ", " v "],
    ["vlt ", " vlt "],
    [" ce ", " cv "],
    [" ii ", " 2 "],
    ["pln ", " pln "],
    ["olv ", " olv "],
    ["mkt ", " mkt "],
    ["tvl ", " tvl "],
    [" ob ", " o "],
    ["pgr ", " pgr "],
    [" in ", " 1 "],
    [" mw ", " m "],
    ["kri ", " kri "],
    ["pko ", " pko "],
    ["auk ", " auk "],
    ["tie ", " t "],
    [" i ", " 1 "]
  ]
 }