Skip to content

Commit fd9eaf1

Browse files
committed
Title: Fix a bug in ChineseTokenizer
Description: When using ChineseAnalyzer for Chinese word segmentation, runs of English letters and digits are treated as a single token; they should be split into separate tokens. Root cause: not stated. Solution: track whether the previous character was a letter or a digit in ChineseTokenizer::incrementToken, and flush the current token when the character class changes (also treat digits like other single-character tokens in ChineseFilter):
1 parent 9a48a2a commit fd9eaf1

File tree

2 files changed

+22
-2
lines changed

2 files changed

+22
-2
lines changed

src/contrib/analyzers/common/analysis/cn/ChineseFilter.cpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -38,7 +38,7 @@ bool ChineseFilter::incrementToken() {
3838
if (text.length() > 1) {
3939
return true;
4040
}
41-
} else if (UnicodeUtil::isOther(text[0])) {
41+
} else if (UnicodeUtil::isOther(text[0]) || UnicodeUtil::isDigit(text[0])) {
4242
// One Chinese character as one Chinese word.
4343
// Chinese word extraction to be added later here.
4444
return true;

src/contrib/analyzers/common/analysis/cn/ChineseTokenizer.cpp

Lines changed: 21 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -65,6 +65,7 @@ bool ChineseTokenizer::incrementToken() {
6565

6666
length = 0;
6767
start = offset;
68+
bool last_is_en = false, last_is_num = false;
6869

6970
while (true) {
7071
wchar_t c;
@@ -82,11 +83,30 @@ bool ChineseTokenizer::incrementToken() {
8283
c = ioBuffer[bufferIndex++];
8384
}
8485

85-
if (UnicodeUtil::isDigit(c) || UnicodeUtil::isLower(c) || UnicodeUtil::isUpper(c)) {
86+
if (UnicodeUtil::isLower(c) || UnicodeUtil::isUpper(c)) {
87+
if (last_is_num) {
88+
--bufferIndex;
89+
--offset;
90+
return flush();
91+
}
92+
93+
push(c);
94+
if (length == MAX_WORD_LEN) {
95+
return flush();
96+
}
97+
last_is_en = true;
98+
} else if (UnicodeUtil::isDigit(c)) {
99+
if (last_is_en) {
100+
--bufferIndex;
101+
--offset;
102+
return flush();
103+
}
104+
86105
push(c);
87106
if (length == MAX_WORD_LEN) {
88107
return flush();
89108
}
109+
last_is_num = true;
90110
} else if (UnicodeUtil::isOther(c)) {
91111
if (length > 0) {
92112
--bufferIndex;

0 commit comments

Comments
 (0)