File tree Expand file tree Collapse file tree 2 files changed +22
-2
lines changed
src/contrib/analyzers/common/analysis/cn Expand file tree Collapse file tree 2 files changed +22
-2
lines changed Original file line number Diff line number Diff line change @@ -38,7 +38,7 @@ bool ChineseFilter::incrementToken() {
38
38
if (text.length () > 1 ) {
39
39
return true ;
40
40
}
41
- } else if (UnicodeUtil::isOther (text[0 ])) {
41
+ } else if (UnicodeUtil::isOther (text[0 ]) || UnicodeUtil::isDigit (text[ 0 ]) ) {
42
42
// One Chinese character as one Chinese word.
43
43
// Chinese word extraction to be added later here.
44
44
return true ;
Original file line number Diff line number Diff line change @@ -65,6 +65,7 @@ bool ChineseTokenizer::incrementToken() {
65
65
66
66
length = 0 ;
67
67
start = offset;
68
+ bool last_is_en = false , last_is_num = false ;
68
69
69
70
while (true ) {
70
71
wchar_t c;
@@ -82,11 +83,30 @@ bool ChineseTokenizer::incrementToken() {
82
83
c = ioBuffer[bufferIndex++];
83
84
}
84
85
85
- if (UnicodeUtil::isDigit (c) || UnicodeUtil::isLower (c) || UnicodeUtil::isUpper (c)) {
86
+ if (UnicodeUtil::isLower (c) || UnicodeUtil::isUpper (c)) {
87
+ if (last_is_num) {
88
+ --bufferIndex;
89
+ --offset;
90
+ return flush ();
91
+ }
92
+
93
+ push (c);
94
+ if (length == MAX_WORD_LEN) {
95
+ return flush ();
96
+ }
97
+ last_is_en = true ;
98
+ } else if (UnicodeUtil::isDigit (c)) {
99
+ if (last_is_en) {
100
+ --bufferIndex;
101
+ --offset;
102
+ return flush ();
103
+ }
104
+
86
105
push (c);
87
106
if (length == MAX_WORD_LEN) {
88
107
return flush ();
89
108
}
109
+ last_is_num = true ;
90
110
} else if (UnicodeUtil::isOther (c)) {
91
111
if (length > 0 ) {
92
112
--bufferIndex;
You can’t perform that action at this time.
0 commit comments