Skip to content

Commit 7502c86

Browse files
committed
Add test suite for UTF-{7,8,16,32}
Also fix a couple of small problems with UTF-32 and UTF-8 support:
- UTF-32 would pass very large codepoints (>= 0x80000000), which are not valid.
- UTF-8 would sometimes emit two error marker characters for a single bad input byte.
1 parent cc0aac3 commit 7502c86

File tree

4 files changed

+1048
-35
lines changed

4 files changed

+1048
-35
lines changed

ext/mbstring/libmbfl/filters/mbfilter_utf32.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,7 @@ const struct mbfl_convert_vtbl vtbl_wchar_utf32le = {
131131

132132
static int emit_char_if_valid(int n, mbfl_convert_filter *filter)
133133
{
134-
if (n < MBFL_WCSPLANE_UTF32MAX && (n < 0xD800 || n > 0xDFFF)) {
134+
if (n >= 0 && n < MBFL_WCSPLANE_UTF32MAX && (n < 0xD800 || n > 0xDFFF)) {
135135
CK((*filter->output_function)(n, filter->data));
136136
} else {
137137
n = (n & MBFL_WCSGROUP_MASK) | MBFL_WCSGROUP_THROUGH;

ext/mbstring/libmbfl/filters/mbfilter_utf8.c

Lines changed: 15 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -86,19 +86,11 @@ const struct mbfl_convert_vtbl vtbl_wchar_utf8 = {
8686

8787
int mbfl_filt_put_invalid_char(int c, mbfl_convert_filter *filter)
8888
{
89-
int w;
90-
w = c & MBFL_WCSGROUP_MASK;
91-
w |= MBFL_WCSGROUP_THROUGH;
92-
filter->status = 0;
93-
filter->cache = 0;
94-
CK((*filter->output_function)(w, filter->data));
89+
filter->status = filter->cache = 0;
90+
CK((*filter->output_function)((c & MBFL_WCSGROUP_MASK) | MBFL_WCSGROUP_THROUGH, filter->data));
9591
return 0;
9692
}
9793

98-
99-
/*
100-
* UTF-8 => wchar
101-
*/
10294
int mbfl_filt_conv_utf8_wchar(int c, mbfl_convert_filter *filter)
10395
{
10496
int s, c1;
@@ -131,7 +123,8 @@ int mbfl_filt_conv_utf8_wchar(int c, mbfl_convert_filter *filter)
131123
CK((*filter->output_function)(s, filter->data));
132124
} else {
133125
CK(mbfl_filt_put_invalid_char(filter->cache, filter));
134-
goto retry;
126+
if (c < 0x80 || (c >= 0xc2 && c <= 0xf4))
127+
goto retry;
135128
}
136129
break;
137130
case 0x20: /* 3byte code 2nd char: 0:0xa0-0xbf,D:0x80-9F,1-C,E-F:0x80-0x9f */
@@ -146,7 +139,8 @@ int mbfl_filt_conv_utf8_wchar(int c, mbfl_convert_filter *filter)
146139
filter->status++;
147140
} else {
148141
CK(mbfl_filt_put_invalid_char(filter->cache, filter));
149-
goto retry;
142+
if (c < 0x80 || (c >= 0xc2 && c <= 0xf4))
143+
goto retry;
150144
}
151145
break;
152146
case 0x30: /* 4byte code 2nd char: 0:0x90-0xbf,1-3:0x80-0xbf,4:0x80-0x8f */
@@ -161,7 +155,8 @@ int mbfl_filt_conv_utf8_wchar(int c, mbfl_convert_filter *filter)
161155
filter->status++;
162156
} else {
163157
CK(mbfl_filt_put_invalid_char(filter->cache, filter));
164-
goto retry;
158+
if (c < 0x80 || (c >= 0xc2 && c <= 0xf4))
159+
goto retry;
165160
}
166161
break;
167162
case 0x31: /* 4byte code 3rd char: 0x80-0xbf */
@@ -170,7 +165,8 @@ int mbfl_filt_conv_utf8_wchar(int c, mbfl_convert_filter *filter)
170165
filter->status++;
171166
} else {
172167
CK(mbfl_filt_put_invalid_char(filter->cache, filter));
173-
goto retry;
168+
if (c < 0x80 || (c >= 0xc2 && c <= 0xf4))
169+
goto retry;
174170
}
175171
break;
176172
default:
@@ -183,27 +179,21 @@ int mbfl_filt_conv_utf8_wchar(int c, mbfl_convert_filter *filter)
183179

184180
int mbfl_filt_conv_utf8_wchar_flush(mbfl_convert_filter *filter)
185181
{
186-
int status, cache;
187-
188-
status = filter->status;
189-
cache = filter->cache;
182+
int status = filter->status, cache = filter->cache;
190183

191-
filter->status = 0;
192-
filter->cache = 0;
184+
filter->status = filter->cache = 0;
193185

194-
if (status != 0) {
186+
if (status) {
195187
CK(mbfl_filt_put_invalid_char(cache, filter));
196188
}
197189

198-
if (filter->flush_function != NULL) {
190+
if (filter->flush_function) {
199191
(*filter->flush_function)(filter->data);
200192
}
193+
201194
return 0;
202195
}
203196

204-
/*
205-
* wchar => UTF-8
206-
*/
207197
int mbfl_filt_conv_wchar_utf8(int c, mbfl_convert_filter *filter)
208198
{
209199
if (c >= 0 && c < 0x110000) {

ext/mbstring/tests/illformed_utf_sequences.phpt

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -22,28 +22,28 @@ var_dump(chk_enc("\x31\x32\x33", 0));
2222
var_dump(chk_enc("\x41\x42\x43", 0));
2323
var_dump(chk_enc("\xc0\xb1\xc0\xb2\xc0\xb3", 6));
2424
var_dump(chk_enc("\xc1\x81\xc1\x82\xc1\x83", 6));
25-
var_dump(chk_enc("\xe0\x80\xb1\xe0\x80\xb2\xe0\x80\xb3", 9));
26-
var_dump(chk_enc("\xe0\x81\x81\xe0\x81\x82\xe0\x81\x83", 9));
27-
var_dump(chk_enc("\xf0\x80\x80\xb1\xf0\x80\x80\xb2\xf0\x80\x80\xb3", 12));
28-
var_dump(chk_enc("\xf0\x80\x81\x81\xf0\x80\x81\x82\xf0\x81\x83", 11));
25+
var_dump(chk_enc("\xe0\x80\xb1\xe0\x80\xb2\xe0\x80\xb3", 6));
26+
var_dump(chk_enc("\xe0\x81\x81\xe0\x81\x82\xe0\x81\x83", 6));
27+
var_dump(chk_enc("\xf0\x80\x80\xb1\xf0\x80\x80\xb2\xf0\x80\x80\xb3", 9));
28+
var_dump(chk_enc("\xf0\x80\x81\x81\xf0\x80\x81\x82\xf0\x81\x83", 8));
2929
var_dump(chk_enc("\xf8\x80\x80\x80\xb1\xf8\x80\x80\x80\xb2\xf8\x80\x80\x80\xb3", 15));
3030
var_dump(chk_enc("\xf8\x80\x80\x81\x81\xf8\x80\x80\x81\x82\xf8\x80\x80\x81\x83", 15));
3131
var_dump(chk_enc("\xfc\x80\x80\x80\x80\xb1\xfc\x80\x80\x80\x80\xb2\xfc\x80\x80\x80\x80\xb3", 18));
3232
var_dump(chk_enc("\xfc\x80\x80\x80\x81\x81\xfc\x80\x80\x80\x81\x82\xfc\x80\x80\x80\x81\x83", 18));
3333

3434
var_dump(chk_enc("\xc2\xa2\xc2\xa3\xc2\xa5", 0));
35-
var_dump(chk_enc("\xe0\x82\xa2\xe0\x82\xa3\xe0\x82\xa5", 9));
36-
var_dump(chk_enc("\xf0\x80\x82\xa2\xf0\x80\x82\xa3\xf0\x80\x82\xa5", 12));
35+
var_dump(chk_enc("\xe0\x82\xa2\xe0\x82\xa3\xe0\x82\xa5", 6));
36+
var_dump(chk_enc("\xf0\x80\x82\xa2\xf0\x80\x82\xa3\xf0\x80\x82\xa5", 9));
3737
var_dump(chk_enc("\xf8\x80\x80\x82\xa2\xf8\x80\x80\x82\xa3\xf8\x80\x80\x82\xa5", 15));
3838
var_dump(chk_enc("\xfc\x80\x80\x80\x82\xa2\xfc\x80\x80\x80\x82\xa3\xfc\x80\x80\x80\x82\xa5", 18));
3939

4040
var_dump(chk_enc("\xc1\xbf", 2));
4141
var_dump(chk_enc("\xc2\x80", 0));
4242
var_dump(chk_enc("\xdf\xbf", 0));
43-
var_dump(chk_enc("\xe0\x9f\xff", 3));
43+
var_dump(chk_enc("\xe0\x9f\xff", 2));
4444
var_dump(chk_enc("\xe0\xa0\x80", 2));
4545
var_dump(chk_enc("\xef\xbf\xbf", 0));
46-
var_dump(chk_enc("\xf0\x8f\xbf\xbf", 4));
46+
var_dump(chk_enc("\xf0\x8f\xbf\xbf", 3));
4747
var_dump(chk_enc("\xf0\x90\x80\x80", 0));
4848
var_dump(chk_enc("\xf7\xbf\xbf\xbf", 4));
4949
var_dump(chk_enc("\xf8\x87\xbf\xbf\xbf", 5));
@@ -58,7 +58,7 @@ echo "UTF-8 and surrogates area\n";
5858
$out = '';
5959
$cnt = 0;
6060
for ($i = 0xd7ff; $i <= 0xe000; ++$i) {
61-
$s = chk_enc(pack('C3', 0xe0 | ($i >> 12), 0x80 | ($i >> 6) & 0x3f, 0x80 | $i & 0x3f), 3);
61+
$s = chk_enc(pack('C3', 0xe0 | ($i >> 12), 0x80 | ($i >> 6) & 0x3f, 0x80 | $i & 0x3f), 2);
6262
if ($s === false) {
6363
$cnt++;
6464
} else {

0 commit comments

Comments
 (0)