Skip to content

Commit fd9eaf1

Browse files
committed
Title: Fix a bug in ChineseTokenizer
Description: When using ChineseAnalyzer for Chinese word segmentation, runs of English letters and digits are treated as a single token; they should be split into separate tokens. Root cause: not stated. Solution: track whether the previous character was a letter or a digit in ChineseTokenizer::incrementToken, and flush the current token when the character class changes (also treat digits like other single-character tokens in ChineseFilter):
1 parent 9a48a2a commit fd9eaf1

File tree

2 files changed

+22
-2
lines changed

2 files changed

+22
-2
lines changed

src/contrib/analyzers/common/analysis/cn/ChineseFilter.cpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -38,7 +38,7 @@ bool ChineseFilter::incrementToken() {
3838
if (text.length() > 1) {
3939
return true;
4040
}
41-
} else if (UnicodeUtil::isOther(text[0])) {
41+
} else if (UnicodeUtil::isOther(text[0]) || UnicodeUtil::isDigit(text[0])) {
4242
// One Chinese character as one Chinese word.
4343
// Chinese word extraction to be added later here.
4444
return true;

src/contrib/analyzers/common/analysis/cn/ChineseTokenizer.cpp

Lines changed: 21 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -65,6 +65,7 @@ bool ChineseTokenizer::incrementToken() {
6565

6666
length = 0;
6767
start = offset;
68+
bool last_is_en = false, last_is_num = false;
6869

6970
while (true) {
7071
wchar_t c;
@@ -82,11 +83,30 @@ bool ChineseTokenizer::incrementToken() {
8283
c = ioBuffer[bufferIndex++];
8384
}
8485

85-
if (UnicodeUtil::isDigit(c) || UnicodeUtil::isLower(c) || UnicodeUtil::isUpper(c)) {
86+
if (UnicodeUtil::isLower(c) || UnicodeUtil::isUpper(c)) {
87+
if (last_is_num) {
88+
--bufferIndex;
89+
--offset;
90+
return flush();
91+
}
92+
93+
push(c);
94+
if (length == MAX_WORD_LEN) {
95+
return flush();
96+
}
97+
last_is_en = true;
98+
} else if (UnicodeUtil::isDigit(c)) {
99+
if (last_is_en) {
100+
--bufferIndex;
101+
--offset;
102+
return flush();
103+
}
104+
86105
push(c);
87106
if (length == MAX_WORD_LEN) {
88107
return flush();
89108
}
109+
last_is_num = true;
90110
} else if (UnicodeUtil::isOther(c)) {
91111
if (length > 0) {
92112
--bufferIndex;

0 commit comments

Comments
 (0)