Skip to content

Commit 09f92de

Browse files
author
Xing Zhang
committed
WL#11825: Add Chinese collation for utf8mb4
We need to reorder characters to implement this Chinese collation because of the CLDR rule [reorder Han]. We divide all Unicode characters into five parts: 1. The core group (spaces and symbols). We don't change the weight of the characters in this group. They sort before all other characters as in the DUCET. 2. 41336 Han characters whose sorting order has been defined by CLDR. These characters sort after the characters of part 1. 3. All other Han characters. These characters sort after the Han characters of part 2. 4. Character groups which are between the core group and the Han group in the DUCET. We need to give them bigger weights than all Han characters, so they sort after the characters of part 3. 5. All other characters. Both CLDR v29 and v30 are incomplete and are missing some very common Han characters (like “small”). Thus we will use the zh.xml file from CLDR v33 to implement this collation. Changed uca9-dump.cc so that uca9dump can generate the weight table files for the Chinese and Japanese languages at build time. Chinese collation regression test added. Benchmark result compared to the Japanese collation: BM_Chinese_AS_CS 18162 ns/iter 25.20 MB/sec BM_Japanese_AS_CS 21975 ns/iter 14.06 MB/sec Benchmark result showing its effect on other collations: BM_SimpleUTF8MB4 2199 -> 2157 ns/iter [+ 1.95%] BM_MixedUTF8MB4 1703 -> 1707 ns/iter [- 0.23%] BM_MixedUTF8MB4_AS_CI 3523 -> 3409 ns/iter [+ 3.34%] BM_MixedUTF8MB4_AS_CS 5065 -> 5049 ns/iter [+ 0.32%] BM_JapaneseUTF8MB4 3659 -> 3693 ns/iter [- 0.92%] BM_Hungarian_AS_CS 36518 -> 37603 ns/iter [- 2.89%] BM_Japanese_AS_CS 21684 -> 21880 ns/iter [- 0.90%] BM_Japanese_AS_CS_KS 29542 -> 29622 ns/iter [- 0.27%] Change-Id: I70c3bd971c4d45ca255b8cd3406535e953e60d56
1 parent 8c8d13e commit 09f92de

22 files changed

+26572
-128772
lines changed

include/mysys_err.h

+2-1
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,8 @@ extern const char *globerrs[]; /* my_error_messages is here */
132132
#define EE_SHIFT_CHAR_OUT_OF_RANGE 88
133133
#define EE_RESET_CHAR_OUT_OF_RANGE 89
134134
#define EE_UNKNOWN_LDML_TAG 90
135-
#define EE_ERROR_LAST 90 /* Copy last error nr */
135+
#define EE_FAILED_TO_RESET_BEFORE_SECONDARY_IGNORABLE_CHAR 91
136+
#define EE_ERROR_LAST 91 /* Copy last error nr */
136137
/* Add error numbers before EE_ERROR_LAST and change it accordingly. */
137138

138139
/* Exit codes for option processing. When exiting from server use the

mysql-test/r/ctype_ldml.result

+1
Original file line numberDiff line numberDiff line change
@@ -529,6 +529,7 @@ utf8mb4_ja_0900_as_cs_ks utf8mb4 304
529529
utf8mb4_0900_as_ci utf8mb4 305
530530
utf8mb4_ru_0900_ai_ci utf8mb4 306
531531
utf8mb4_ru_0900_as_cs utf8mb4 307
532+
utf8mb4_zh_0900_as_cs utf8mb4 308
532533
utf8mb4_test_ci utf8mb4 326
533534
utf16_test_ci utf16 327
534535
utf8mb4_test_400_ci utf8mb4 328

mysql-test/r/ctype_unicode900_as_cs.result

+61
Original file line numberDiff line numberDiff line change
@@ -2014,3 +2014,64 @@ DROP TABLE t1;
20142014
#
20152015
# End of 5.8 tests
20162016
#
2017+
CREATE TABLE t1(a VARCHAR(10)) COLLATE utf8mb4_zh_0900_as_cs;
2018+
INSERT INTO t1 VALUES(_utf16 0x2E87), (_utf16 0x2E8D), (_utf16 0x2F17),
2019+
(_utf16 0x3038), (_utf16 0x24B6), (_utf32 0x1F150), (_utf16 0x4E2D),
2020+
(_utf16 0x3197), (_utf32 0x1F22D), ('A'), ('a'), ('Z'), ('z'),
2021+
(_utf16 0x3082), (_utf16 0x30E2), (_utf16 0x2E31), (_utf16 0x33E8),
2022+
(_utf32 0x1F229), (_utf32 0x1F241), (_utf16 0xFA56);
2023+
SELECT HEX(CONVERT(a USING utf32)), HEX(WEIGHT_STRING(a)) FROM t1 ORDER BY a, HEX(a);
2024+
HEX(CONVERT(a USING utf32)) HEX(WEIGHT_STRING(a))
2025+
00002E31 028C0000002000000002
2026+
0001F241 0379815D037A000000200020002000000002000200020021
2027+
000033E8 1C467F7E0000002000200000000200020021
2028+
00002E87 4CDF000000200110000000040004
2029+
0000FA56 51CD0000002000000002
2030+
00002F17 857A00000020000000020021
2031+
00003038 857A00000020000000020022
2032+
00002E8D 9C310000002000000002
2033+
0001F229 A63E00000020000000020024
2034+
00004E2D B8200000002000000002
2035+
00003197 B82000000020000000020021
2036+
0001F22D B82000000020000000020023
2037+
00000061 BDC40000002000000002
2038+
00000041 BDC40000002000000008
2039+
000024B6 BDC4000000200000000C
2040+
0001F150 BDC4000000200000000C
2041+
0000007A C09E0000002000000002
2042+
0000005A C09E0000002000000008
2043+
00003082 DEFA000000200000000E
2044+
000030E2 DEFA0000002000000011
2045+
DROP TABLE t1;
2046+
CREATE TABLE t1(a VARCHAR(10)) COLLATE utf8mb4_zh_0900_as_cs;
2047+
INSERT INTO t1 VALUES(_utf16 0x6C88), (_utf16 0x5F1E), (_utf16 0x9633),
2048+
(_utf16 0x6C889633), (_utf16 0x5F1E9633);
2049+
SELECT HEX(CONVERT(a USING utf32)), HEX(WEIGHT_STRING(a)) FROM t1 ORDER BY a, HEX(a);
2050+
HEX(CONVERT(a USING utf32)) HEX(WEIGHT_STRING(a))
2051+
00006C88 289C0000002000000002
2052+
00005F1E 848C0000002000000002
2053+
00005F1E00009633 848CA41B000000200020000000020002
2054+
00006C8800009633 848CA41BF645000000200020000000020002
2055+
00009633 A41B0000002000000002
2056+
DROP TABLE t1;
2057+
CREATE TABLE t1(a VARCHAR(10), b VARCHAR(10)) COLLATE utf8mb4_zh_0900_as_cs;
2058+
INSERT INTO t1 VALUES(_utf16 0xF902, _utf16 0x2F9E), (_utf16 0xF907, _utf16 0x2FD4),
2059+
(_utf16 0xF908, _utf16 0x2FD4), (_utf16 0xF9D1, _utf16 0x3285);
2060+
SELECT HEX(CONVERT(a USING utf16)) AS a_u16, HEX(CONVERT(b USING utf16)) AS b_u16, a = b FROM t1;
2061+
a_u16 b_u16 a = b
2062+
F902 2F9E 0
2063+
F907 2FD4 0
2064+
F908 2FD4 0
2065+
F9D1 3285 0
2066+
DROP TABLE t1;
2067+
CREATE TABLE t1(a VARCHAR(10)) COLLATE utf8mb4_zh_0900_as_cs;
2068+
INSERT INTO t1 VALUES(_utf16 0x1EC2), (_utf16 0x1EC3), (_utf16 0x1EC5), (_utf16 0x1EC0), (_utf16 0x1EC7), (_Utf16 0x1EBF);
2069+
SELECT HEX(CONVERT(a USING utf16)) FROM t1 ORDER BY a;
2070+
HEX(CONVERT(a USING utf16))
2071+
1EC5
2072+
1EC3
2073+
1EC2
2074+
1EC7
2075+
1EBF
2076+
1EC0
2077+
DROP TABLE t1;

mysql-test/suite/collations/r/chinese.result

+24,877
Large diffs are not rendered by default.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
--source suite/collations/include/unicode.inc
2+
select hex(convert(uc using utf32)),
3+
hex(weight_string(convert(uc using utf8mb4) collate utf8mb4_zh_0900_as_cs)),
4+
name from unicode
5+
where category in ('Lu','Ll','Lt','Lm','Lo','So')
6+
order by uc collate utf8mb4_zh_0900_as_cs, cp;
7+
8+
drop table if exists unicode;

mysql-test/suite/engines/funcs/r/db_alter_collate_ascii.result

+1
Original file line numberDiff line numberDiff line change
@@ -243,6 +243,7 @@ utf8mb4_unicode_ci utf8mb4 224 # # PAD SPACE
243243
utf8mb4_vietnamese_ci utf8mb4 247 # # PAD SPACE
244244
utf8mb4_vi_0900_ai_ci utf8mb4 277 # # NO PAD
245245
utf8mb4_vi_0900_as_cs utf8mb4 300 # # NO PAD
246+
utf8mb4_zh_0900_as_cs utf8mb4 308 # # NO PAD
246247
utf8_bin utf8 83 # # PAD SPACE
247248
utf8_croatian_ci utf8 213 # # PAD SPACE
248249
utf8_czech_ci utf8 202 # # PAD SPACE

mysql-test/suite/engines/funcs/r/db_alter_collate_utf8.result

+1
Original file line numberDiff line numberDiff line change
@@ -243,6 +243,7 @@ utf8mb4_unicode_ci utf8mb4 224 # # PAD SPACE
243243
utf8mb4_vietnamese_ci utf8mb4 247 # # PAD SPACE
244244
utf8mb4_vi_0900_ai_ci utf8mb4 277 # # NO PAD
245245
utf8mb4_vi_0900_as_cs utf8mb4 300 # # NO PAD
246+
utf8mb4_zh_0900_as_cs utf8mb4 308 # # NO PAD
246247
utf8_bin utf8 83 # # PAD SPACE
247248
utf8_croatian_ci utf8 213 # # PAD SPACE
248249
utf8_czech_ci utf8 202 # # PAD SPACE

mysql-test/suite/innodb/r/innodb-2byte-collation.result

+1
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ utf8mb4_ja_0900_as_cs_ks utf8mb4 304
5151
utf8mb4_0900_as_ci utf8mb4 305
5252
utf8mb4_ru_0900_ai_ci utf8mb4 306
5353
utf8mb4_ru_0900_as_cs utf8mb4 307
54+
utf8mb4_zh_0900_as_cs utf8mb4 308
5455
utf8mb4_test_ci utf8mb4 326
5556
utf16_test_ci utf16 327
5657
utf8mb4_test_400_ci utf8mb4 328

mysql-test/suite/innodb/r/innodb_ctype_ldml.result

+1
Original file line numberDiff line numberDiff line change
@@ -505,6 +505,7 @@ utf8mb4_ja_0900_as_cs_ks utf8mb4 304
505505
utf8mb4_0900_as_ci utf8mb4 305
506506
utf8mb4_ru_0900_ai_ci utf8mb4 306
507507
utf8mb4_ru_0900_as_cs utf8mb4 307
508+
utf8mb4_zh_0900_as_cs utf8mb4 308
508509
utf8mb4_test_ci utf8mb4 326
509510
utf16_test_ci utf16 327
510511
utf8mb4_test_400_ci utf8mb4 328

mysql-test/t/ctype_unicode900_as_cs.test

+45
Original file line numberDiff line numberDiff line change
@@ -264,3 +264,48 @@ DROP TABLE t1;
264264
--echo #
265265
--echo # End of 5.8 tests
266266
--echo #
267+
268+
# Test that the characters in different groups are reordered correctly. For example,
269+
# U+33E8 is in the core group, and U+2F17 is in the Han group, and 'A' is in
270+
# the latin group. According to the reorder rule defined by the CLDR for the
271+
# Chinese collation, we should get U+33E8 < U+2F17 < 'A'. This also tests how
272+
# different glyphs of one Han character sort according to the weight shift rule
273+
# defined by CLDR. For example, U+3197 (IDEOGRAPHIC ANNOTATION MIDDLE MARK) and
274+
# U+4E2D (CJK UNIFIED IDEOGRAPH-4E2D) are different glyphs of a Chinese
275+
# character which means 'middle' and the CLDR defines "U+4E2D <<< U+3197".
276+
CREATE TABLE t1(a VARCHAR(10)) COLLATE utf8mb4_zh_0900_as_cs;
277+
INSERT INTO t1 VALUES(_utf16 0x2E87), (_utf16 0x2E8D), (_utf16 0x2F17),
278+
(_utf16 0x3038), (_utf16 0x24B6), (_utf32 0x1F150), (_utf16 0x4E2D),
279+
(_utf16 0x3197), (_utf32 0x1F22D), ('A'), ('a'), ('Z'), ('z'),
280+
(_utf16 0x3082), (_utf16 0x30E2), (_utf16 0x2E31), (_utf16 0x33E8),
281+
(_utf32 0x1F229), (_utf32 0x1F241), (_utf16 0xFA56);
282+
SELECT HEX(CONVERT(a USING utf32)), HEX(WEIGHT_STRING(a)) FROM t1 ORDER BY a, HEX(a);
283+
DROP TABLE t1;
284+
285+
# Test how the contraction of Han characters sorts. For example, U+6C88 and
286+
# U+5F1E are different characters, and U+6C88 < U+5F1E. But the strings
287+
# U+6C88U+9633 and U+5F1EU+9633 mean the same thing. In such a contraction case,
288+
# U+5F1EU+9633 < U+6C88U+9633.
289+
CREATE TABLE t1(a VARCHAR(10)) COLLATE utf8mb4_zh_0900_as_cs;
290+
INSERT INTO t1 VALUES(_utf16 0x6C88), (_utf16 0x5F1E), (_utf16 0x9633),
291+
(_utf16 0x6C889633), (_utf16 0x5F1E9633);
292+
SELECT HEX(CONVERT(a USING utf32)), HEX(WEIGHT_STRING(a)) FROM t1 ORDER BY a, HEX(a);
293+
DROP TABLE t1;
294+
295+
# This tests how different glyphs of one Han character sort. For example,
296+
# U+2F9E (KANGXI RADICAL CART) and U+F902 (CJK COMPATIBILITY IDEOGRAPH-F902)
297+
# are different glyphs of the Chinese character which means 'cart'.
298+
CREATE TABLE t1(a VARCHAR(10), b VARCHAR(10)) COLLATE utf8mb4_zh_0900_as_cs;
299+
INSERT INTO t1 VALUES(_utf16 0xF902, _utf16 0x2F9E), (_utf16 0xF907, _utf16 0x2FD4),
300+
(_utf16 0xF908, _utf16 0x2FD4), (_utf16 0xF9D1, _utf16 0x3285);
301+
SELECT HEX(CONVERT(a USING utf16)) AS a_u16, HEX(CONVERT(b USING utf16)) AS b_u16, a = b FROM t1;
302+
DROP TABLE t1;
303+
304+
# CLDR defines some weight shift rules for Chinese Bopomofo characters.
305+
# Bopomofo is a group of latin characters used to illustrate how a Han character
306+
# is pronounced. For example, 'e' is one of the Bopomofo characters. This tests
307+
# how accented Latin characters which are not in the Bopomofo group should be sorted.
308+
CREATE TABLE t1(a VARCHAR(10)) COLLATE utf8mb4_zh_0900_as_cs;
309+
INSERT INTO t1 VALUES(_utf16 0x1EC2), (_utf16 0x1EC3), (_utf16 0x1EC5), (_utf16 0x1EC0), (_utf16 0x1EC7), (_Utf16 0x1EBF);
310+
SELECT HEX(CONVERT(a USING utf16)) FROM t1 ORDER BY a;
311+
DROP TABLE t1;

mysys/charset-def.cc

+3-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
/* Copyright (c) 2000, 2017, Oracle and/or its affiliates. All rights reserved.
1+
/* Copyright (c) 2000, 2018, Oracle and/or its affiliates. All rights reserved.
22
33
This program is free software; you can redistribute it and/or modify
44
it under the terms of the GNU General Public License, version 2.0,
@@ -248,6 +248,7 @@ extern CHARSET_INFO my_charset_utf8mb4_ja_0900_as_cs;
248248
extern CHARSET_INFO my_charset_utf8mb4_ja_0900_as_cs_ks;
249249
extern CHARSET_INFO my_charset_utf8mb4_0900_as_ci;
250250
extern CHARSET_INFO my_charset_utf8mb4_ru_0900_as_cs;
251+
extern CHARSET_INFO my_charset_utf8mb4_zh_0900_as_cs;
251252

252253
extern CHARSET_INFO my_charset_gb18030_unicode_520_ci;
253254

@@ -425,6 +426,7 @@ bool init_compiled_charsets(myf flags MY_ATTRIBUTE((unused))) {
425426
add_compiled_collation(&my_charset_utf8mb4_ja_0900_as_cs_ks);
426427
add_compiled_collation(&my_charset_utf8mb4_0900_as_ci);
427428
add_compiled_collation(&my_charset_utf8mb4_ru_0900_as_cs);
429+
add_compiled_collation(&my_charset_utf8mb4_zh_0900_as_cs);
428430

429431
add_compiled_collation(&my_charset_utf16_general_ci);
430432
add_compiled_collation(&my_charset_utf16_bin);

mysys/errors.cc

+3-2
Original file line numberDiff line numberDiff line change
@@ -134,10 +134,11 @@ const char *globerrs[GLOBERRS] = {
134134
"Invalid decimal value for option '%s'.",
135135
"%s.",
136136
"Failed to reset before a primary ignorable character %s.",
137-
"Failed to reset before a territory ignorable character %s.",
137+
"Failed to reset before a tertiary ignorable character %s.",
138138
"Shift character out of range: %s.",
139139
"Reset character out of range: %s.",
140-
"Unknown LDML tag: '%.*s'."};
140+
"Unknown LDML tag: '%.*s'.",
141+
"Failed to reset before a secondary ignorable character %s."};
141142

142143
/*
143144
We cannot call my_error/my_printf_error here in this function.

strings/CMakeLists.txt

+24-3
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright (c) 2006, 2017, Oracle and/or its affiliates. All rights reserved.
1+
# Copyright (c) 2006, 2018, Oracle and/or its affiliates. All rights reserved.
22
#
33
# This program is free software; you can redistribute it and/or modify
44
# it under the terms of the GNU General Public License, version 2.0,
@@ -67,11 +67,32 @@ IF(CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND
6767
ADD_COMPILE_FLAGS(dtoa.cc COMPILE_FLAGS "-fno-strict-aliasing")
6868
ENDIF()
6969

70+
MYSQL_ADD_EXECUTABLE(uca9dump uca9-dump.cc SKIP_INSTALL)
71+
72+
SET(ZH_HANS_SRC_FILE ${CMAKE_SOURCE_DIR}/strings/lang_data/zh_hans.txt)
73+
SET(ZH_HANS_DST_FILE ${CMAKE_BINARY_DIR}/strings/uca900_zh_tbls.cc)
74+
SET(JA_HANS_SRC_FILE ${CMAKE_SOURCE_DIR}/strings/lang_data/ja_hans.txt)
75+
SET(JA_HANS_DST_FILE ${CMAKE_BINARY_DIR}/strings/uca900_ja_tbls.cc)
76+
ADD_CUSTOM_COMMAND(OUTPUT ${ZH_HANS_DST_FILE}
77+
${JA_HANS_DST_FILE}
78+
COMMAND uca9dump zh
79+
--in_file=${ZH_HANS_SRC_FILE}
80+
--out_file=${ZH_HANS_DST_FILE}
81+
COMMAND uca9dump ja
82+
--in_file=${JA_HANS_SRC_FILE}
83+
--out_file=${JA_HANS_DST_FILE}
84+
DEPENDS uca9dump ${ZH_HANS_SRC_FILE} ${JA_HANS_SRC_FILE}
85+
)
86+
87+
SET_SOURCE_FILES_PROPERTIES(
88+
${JA_HANS_DST_FILE} ${ZH_HANS_DST_FILE}
89+
PROPERTIES GENERATED TRUE
90+
)
91+
92+
LIST(APPEND STRINGS_SOURCES ${JA_HANS_DST_FILE} ${ZH_HANS_DST_FILE})
7093
# Avoid dependencies on perfschema data defined in mysys
7194
ADD_DEFINITIONS(-DDISABLE_MYSQL_THREAD_H)
7295
ADD_CONVENIENCE_LIBRARY(strings ${STRINGS_SOURCES})
7396

7497
MYSQL_ADD_EXECUTABLE(conf_to_src conf_to_src.cc SKIP_INSTALL)
7598
TARGET_LINK_LIBRARIES(conf_to_src strings)
76-
77-
MYSQL_ADD_EXECUTABLE(uca9dump uca9-dump.cc SKIP_INSTALL)

0 commit comments

Comments
 (0)