remove special status of partial tokens

Full-word tokens are no longer marked by a space at the beginning of the token. Use the new Partial token category instead. This removes a couple of special cases that we don't really need. The word table still has the space for compatibility reasons, so the tokenizer code needs to get rid of it when loading the tokens.
2026-02-16 15:47:58 +00:00 · 2021-07-14 22:17:17 +02:00
parent 6070c3d1d5
commit 143ff14466
5 changed files with 34 additions and 69 deletions
--- a/lib-php/tokenizer/legacy_tokenizer.php
+++ b/lib-php/tokenizer/legacy_tokenizer.php
@@ -137,14 +137,14 @@ class Tokenizer

            // Try more interpretations for Tokens that could not be matched.
            foreach ($aTokens as $sToken) {
-                if ($sToken[0] == ' ' && !$oValidTokens->contains($sToken)) {
-                    if (preg_match('/^ ([0-9]{5}) [0-9]{4}$/', $sToken, $aData)) {
+                if ($sToken[0] != ' ' && !$oValidTokens->contains($sToken)) {
+                    if (preg_match('/^([0-9]{5}) [0-9]{4}$/', $sToken, $aData)) {
                        // US ZIP+4 codes - merge in the 5-digit ZIP code
                        $oValidTokens->addToken(
                            $sToken,
                            new Token\Postcode(null, $aData[1], 'us')
                        );
-                    } elseif (preg_match('/^ [0-9]+$/', $sToken)) {
+                    } elseif (preg_match('/^[0-9]+$/', $sToken)) {
                        // Unknown single word token with a number.
                        // Assume it is a house number.
                        $oValidTokens->addToken(