ignore Unicode format characters for normalization

Also adds tests. Fixes #1007.
2026-02-26 11:08:13 +00:00 · 2018-04-10 22:48:17 +02:00
parent 28ee59dd64
commit ae83ceab5e
5 changed files with 22 additions and 2 deletions
--- a/settings/defaults.php
+++ b/settings/defaults.php
@@ -20,7 +20,7 @@ if (isset($_GET['debug']) && $_GET['debug']) @define('CONST_Debug', true);
 // Rules for normalizing terms before they are compared. The default is to
 // remove accents, punctuation and Unicode format characters and to
 // lower-case the term. Spaces are kept but collapsed to one standard space.
-@define('CONST_Term_Normalization_Rules', ":: NFD (); [:Nonspacing Mark:] >;  :: lower (); [[:Punctuation:][:Space:]]+ > ' '; :: NFC ();");
+@define('CONST_Term_Normalization_Rules', ":: NFD (); [[:Nonspacing Mark:] [:Cf:]] >;  :: lower (); [[:Punctuation:][:Space:]]+ > ' '; :: NFC ();");
 // Set to false to avoid importing extra postcodes for the US.
@define('CONST_Use_Extra_US_Postcodes', true);
--- a/test/bdd/db/import/postcodes.feature
+++ b/test/bdd/db/import/postcodes.feature
@@ -95,7 +95,6 @@ Feature: Import of postcodes
            | object | postcode |
            | W93    | 445023   |
    @wip
    Scenario: Postcodes from admin boundaries are preferred over estimated postcodes
        Given the scene admin-areas
        And the named places
--- a/test/bdd/db/query/normalization.feature
+++ b/test/bdd/db/query/normalization.feature
@@ -136,3 +136,13 @@ Feature: Import and search of names
        Then results contain
         | ID | osm_type | osm_id |
         | 0  | R        | 1 |
     Scenario: Unprintable characters in postcodes are ignored
        Given the named places
            | osm  | class   | type   | address |
            | N234 | amenity | prison | 'postcode' : u'1234\u200e' |
        When importing
        And searching for "1234"
        Then results contain
         | ID | osm_type |
         | 0  | P        |
--- a/test/bdd/osm2pgsql/import/tags.feature
+++ b/test/bdd/osm2pgsql/import/tags.feature
@@ -96,6 +96,15 @@ Feature: Tag evaluation
         | N3     | 'name: de' : 'Foo', 'name:\\\\' : 'real3' |
         | N4     | 'name: de' : 'Foo', 'name' : 'rea\\l3' |
    Scenario: Unprintable characters in address tags are maintained
        When loading osm data
         """
         n23 Tamenity=yes,name=foo,addr:postcode=1234%200e%
         """
        Then place contains
         | object | address |
         | N23    | 'postcode' : u'1234\u200e' |
    Scenario Outline: Included places
        When loading osm data
         """
--- a/test/bdd/steps/db_ops.py
+++ b/test/bdd/steps/db_ops.py
@@ -22,6 +22,8 @@ class PlaceColumn:
            self.add_hstore('extratags', key[6:], value)
        elif key.startswith('addr+'):
            self.add_hstore('address', key[5:], value)
        elif key in ('name', 'address', 'extratags'):
            self.columns[key] = eval('{' + value + '}')
        else:
            assert_in(key, ('class', 'type'))
            self.columns[key] = None if value == '' else value