Skip to content

Commit 127492f

Browse files
authored
Merge pull request #160 from Kakueeen/master
Fix a bug in ChineseTokenizer
2 parents df65bf5 + fd9eaf1 commit 127492f

File tree

2 files changed

+22
-2
lines changed

2 files changed

+22
-2
lines changed

src/contrib/analyzers/common/analysis/cn/ChineseFilter.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ bool ChineseFilter::incrementToken() {
3838
if (text.length() > 1) {
3939
return true;
4040
}
41-
} else if (UnicodeUtil::isOther(text[0])) {
41+
} else if (UnicodeUtil::isOther(text[0]) || UnicodeUtil::isDigit(text[0])) {
4242
// One Chinese character as one Chinese word.
4343
// Chinese word extraction to be added later here.
4444
return true;

src/contrib/analyzers/common/analysis/cn/ChineseTokenizer.cpp

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ bool ChineseTokenizer::incrementToken() {
6565

6666
length = 0;
6767
start = offset;
68+
bool last_is_en = false, last_is_num = false;
6869

6970
while (true) {
7071
wchar_t c;
@@ -82,11 +83,30 @@ bool ChineseTokenizer::incrementToken() {
8283
c = ioBuffer[bufferIndex++];
8384
}
8485

85-
if (UnicodeUtil::isDigit(c) || UnicodeUtil::isLower(c) || UnicodeUtil::isUpper(c)) {
86+
if (UnicodeUtil::isLower(c) || UnicodeUtil::isUpper(c)) {
87+
if (last_is_num) {
88+
--bufferIndex;
89+
--offset;
90+
return flush();
91+
}
92+
93+
push(c);
94+
if (length == MAX_WORD_LEN) {
95+
return flush();
96+
}
97+
last_is_en = true;
98+
} else if (UnicodeUtil::isDigit(c)) {
99+
if (last_is_en) {
100+
--bufferIndex;
101+
--offset;
102+
return flush();
103+
}
104+
86105
push(c);
87106
if (length == MAX_WORD_LEN) {
88107
return flush();
89108
}
109+
last_is_num = true;
90110
} else if (UnicodeUtil::isOther(c)) {
91111
if (length > 0) {
92112
--bufferIndex;

0 commit comments

Comments
 (0)