reorganize keyword creation for legacy tokenizer

- only save partial words without internal spaces
- consider comma and semicolon as separators of full words
- consider parts before an opening bracket a full word
  (but not the part after the bracket)

Fixes #244.
This commit is contained in:
Sarah Hoffmann
2021-05-23 23:58:58 +02:00
parent fa3e48c59f
commit 4f4d15c28a
4 changed files with 85 additions and 29 deletions

View File

@@ -223,11 +223,32 @@ def test_update_special_phrase_modify(analyzer, word_table):
# Check the value of info['names'] that process_place() returns for a place
# whose 'name' dict carries both a 'name' and a 'ref' entry.
def test_process_place_names(analyzer, getorcreate_term_id):
with analyzer() as anl:
info = anl.process_place({'name' : {'name' : 'Soft bAr', 'ref': '34'}})
# NOTE(review): the two assertions below contradict each other. This text
# appears to come from a diff view, so presumably the first line is the
# removed expectation and the second its replacement -- confirm against the
# real file before relying on either.
assert info['names'] == '{1,2,3,4,5,6}'
assert info['names'] == '{1,2,3,4,5}'
@pytest.mark.parametrize('sep', [',', ';'])
def test_full_names_with_separator(analyzer, getorcreate_term_id, sep):
    """Check that a name joined with `sep` is split into separate full names.

    Two names are joined with the separator under test and handed to
    _compute_full_names(); the result must equal the set obtained by
    running make_standard_word() on each name individually.
    """
    parts = ('New York', 'Big Apple')

    with analyzer() as anl:
        joined = sep.join(parts)
        full_names = anl._compute_full_names({'name': joined})
        # Computed after the call under test, in the same order as before.
        expect = {anl.make_standard_word(part) for part in parts}

    assert full_names == expect
def test_process_place_names_with_bracket(analyzer, getorcreate_term_id):
    """Check the names reported for a place name containing a bracketed part.

    process_place() is run on 'Houseboat (left)'; the evaluated
    info['names'] must equal the set of make_standard_word() results for
    the four listed variants.
    """
    variants = (' houseboat', ' houseboat left', 'houseboat', 'left')

    with analyzer() as anl:
        place = {'name': {'name': 'Houseboat (left)'}}
        info = anl.process_place(place)
        # Computed after process_place(), in the same order as before.
        expect = {anl.make_standard_word(variant) for variant in variants}

    # info['names'] is a string; eval() turns it into a Python object for
    # the comparison. The input is produced by the code under test, not by
    # an external source.
    assert eval(info['names']) == expect
@pytest.mark.parametrize('pcode', ['12345', 'AB 123', '34-345'])