Ignore Unicode format characters (general category Cf) during term normalization

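Also adds tests for postcodes containing an unprintable character (U+200E): the character is kept in the stored address tag but ignored when searching.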

Fixes #1007.
This commit is contained in:
Sarah Hoffmann
2018-04-10 22:48:17 +02:00
parent 28ee59dd64
commit ae83ceab5e
5 changed files with 22 additions and 2 deletions

View File

@@ -20,7 +20,7 @@ if (isset($_GET['debug']) && $_GET['debug']) @define('CONST_Debug', true);
// Rules for normalizing terms for comparison before doing comparisons. // Rules for normalizing terms for comparison before doing comparisons.
// The default is to remove accents and punctuation and to lower-case the // The default is to remove accents and punctuation and to lower-case the
// term. Spaces are kept but collapsed to one standard space. // term. Spaces are kept but collapsed to one standard space.
@define('CONST_Term_Normalization_Rules', ":: NFD (); [:Nonspacing Mark:] >; :: lower (); [[:Punctuation:][:Space:]]+ > ' '; :: NFC ();"); @define('CONST_Term_Normalization_Rules', ":: NFD (); [[:Nonspacing Mark:] [:Cf:]] >; :: lower (); [[:Punctuation:][:Space:]]+ > ' '; :: NFC ();");
// Set to false to avoid importing extra postcodes for the US. // Set to false to avoid importing extra postcodes for the US.
@define('CONST_Use_Extra_US_Postcodes', true); @define('CONST_Use_Extra_US_Postcodes', true);

View File

@@ -95,7 +95,6 @@ Feature: Import of postcodes
| object | postcode | | object | postcode |
| W93 | 445023 | | W93 | 445023 |
@wip
Scenario: Postcodes from admin boundaries are preferred over estimated postcodes Scenario: Postcodes from admin boundaries are preferred over estimated postcodes
Given the scene admin-areas Given the scene admin-areas
And the named places And the named places

View File

@@ -136,3 +136,13 @@ Feature: Import and search of names
Then results contain Then results contain
| ID | osm_type | osm_id | | ID | osm_type | osm_id |
| 0 | R | 1 | | 0 | R | 1 |
Scenario: Unprintable characters in postcodes are ignored
Given the named places
| osm | class | type | address |
| N234 | amenity | prison | 'postcode' : u'1234\u200e' |
When importing
And searching for "1234"
Then results contain
| ID | osm_type |
| 0 | P |

View File

@@ -96,6 +96,15 @@ Feature: Tag evaluation
| N3 | 'name: de' : 'Foo', 'name:\\\\' : 'real3' | | N3 | 'name: de' : 'Foo', 'name:\\\\' : 'real3' |
| N4 | 'name: de' : 'Foo', 'name' : 'rea\\l3' | | N4 | 'name: de' : 'Foo', 'name' : 'rea\\l3' |
Scenario: Unprintable character in address tag are maintained
When loading osm data
"""
n23 Tamenity=yes,name=foo,addr:postcode=1234%200e%
"""
Then place contains
| object | address |
| N23 | 'postcode' : u'1234\u200e' |
Scenario Outline: Included places Scenario Outline: Included places
When loading osm data When loading osm data
""" """

View File

@@ -22,6 +22,8 @@ class PlaceColumn:
self.add_hstore('extratags', key[6:], value) self.add_hstore('extratags', key[6:], value)
elif key.startswith('addr+'): elif key.startswith('addr+'):
self.add_hstore('address', key[5:], value) self.add_hstore('address', key[5:], value)
elif key in ('name', 'address', 'extratags'):
self.columns[key] = eval('{' + value + '}')
else: else:
assert_in(key, ('class', 'type')) assert_in(key, ('class', 'type'))
self.columns[key] = None if value == '' else value self.columns[key] = None if value == '' else value