21
21
22
22
package kotlinx.io
23
23
24
+ import kotlinx.io.internal.REPLACEMENT_CHARACTER
24
25
import kotlinx.io.internal.REPLACEMENT_CODE_POINT
25
- import kotlinx.io.internal.commonAsUtf8ToByteArray
26
26
import kotlinx.io.internal.processUtf8CodePoints
27
27
import kotlin.test.*
28
28
@@ -144,52 +144,109 @@ class Utf8Test {
144
144
145
145
/** Runs the code point encoding checks with no prefix bytes in front of the encoded data. */
@Test
fun bufferWriteCodePoints() = bufferWriteCodePointsCheck(prefixLength = 0)
149
+
150
/**
 * Runs the code point encoding checks with `Segment.SIZE - 1` prefix bytes written first.
 *
 * NOTE(review): judging by the test name, the prefix is meant to push multi-byte encodings
 * across a segment boundary - confirm against the segment layout.
 */
@Test
fun bufferWriteCodePointsCrossSegments() = bufferWriteCodePointsCheck(prefixLength = Segment.SIZE - 1)
154
+
155
/**
 * Writes a series of code points into one shared buffer and verifies the bytes produced
 * for each of them.
 *
 * @param prefixLength number of bytes written before every code point and skipped again
 * before the encoded bytes are compared.
 */
private fun bufferWriteCodePointsCheck(prefixLength: Int) {
    val buffer = Buffer()
    // Expected UTF-8 bytes (hex) paired with the code point that should produce them.
    val cases = listOf(
        "40" to '@'.code,
        "7f" to '\u007f'.code,
        "c280" to '\u0080'.code,
        "c2a9" to '\u00a9'.code,
        "c3bf" to '\u00ff'.code,
        "dfbf" to '\u07ff'.code,
        "e0a080" to '\u0800'.code,
        "e1839a" to '\u10da'.code,
        "efbfbf" to '\uffff'.code,
        "f0908080" to 0x10000,
        "f48087bf" to 0x1001FF,
    )
    for ((expectedHex, codePoint) in cases) {
        buffer.assertCodePointEncoded(expectedHex, codePoint, prefixLength)
    }
}
160
169
161
170
/** Runs the code point decoding checks with no prefix bytes in front of the encoded data. */
@Test
fun bufferReadCodePoints() = bufferReadCodePointsCheck(prefixLength = 0)
174
+
175
/**
 * Runs the code point decoding checks with `Segment.SIZE - 1` prefix bytes written first.
 *
 * NOTE(review): judging by the test name, the prefix is meant to make the encoded bytes
 * straddle a segment boundary - confirm against the segment layout.
 */
@Test
fun bufferReadCodePointsCrossSegments() = bufferReadCodePointsCheck(prefixLength = Segment.SIZE - 1)
179
+
180
/**
 * Feeds a series of UTF-8 byte sequences through one shared buffer and verifies the code
 * point decoded from each of them.
 *
 * @param prefixLength number of bytes written before every sequence and skipped again
 * before the code point is read.
 */
private fun bufferReadCodePointsCheck(prefixLength: Int) {
    val buffer = Buffer()
    // Expected code point paired with the UTF-8 bytes (hex) it should be decoded from.
    val cases = listOf(
        '@'.code to "40",
        '\u007f'.code to "7f",
        '\u0080'.code to "c280",
        '\u00a9'.code to "c2a9",
        '\u00ff'.code to "c3bf",
        '\u07ff'.code to "dfbf",
        '\u0800'.code to "e0a080",
        '\u10da'.code to "e1839a",
        '\uffff'.code to "efbfbf",
        0x10000 to "f0908080",
        0x1001FF to "f48087bf",
    )
    for ((expectedCodePoint, hex) in cases) {
        buffer.assertCodePointDecoded(expectedCodePoint, hex, prefixLength)
    }
}
176
194
177
195
/** Runs the string encoding checks with no prefix bytes in front of the encoded data. */
@Test
fun bufferWriteUtf8String() = bufferWriteUtf8StringCheck(prefixLength = 0)
199
+
200
/**
 * Runs the string encoding checks with `Segment.SIZE - 1` prefix bytes written first.
 *
 * NOTE(review): judging by the test name, the prefix is meant to make the encoded string
 * span a segment boundary - confirm against the segment layout.
 */
@Test
fun bufferWriteUtf8StringCrossSegments() = bufferWriteUtf8StringCheck(prefixLength = Segment.SIZE - 1)
204
+
205
/**
 * Writes a series of strings into one shared buffer and verifies the UTF-8 bytes produced
 * for each of them.
 *
 * @param prefixLength number of bytes written before every string and skipped again before
 * the encoded bytes are compared.
 */
private fun bufferWriteUtf8StringCheck(prefixLength: Int) {
    val buffer = Buffer()
    // Expected UTF-8 bytes (hex) paired with the string that should produce them.
    val cases = listOf(
        "68656c6c6f" to "hello",
        "cf87ceb5cf81ceb5cf84ceb9cf83cebccf8ccf82" to "χερετισμός",
        "e18392e18390e1839be18390e183a0e183afe1839de18391e18390" to "გამარჯობა",
        // 𓇋𓋴𓀥, to hail, AN EGYPTIAN HIEROGLYPHIC DICTIONARY, p. 79b
        "f093878bf0938bb4f09380a5" to "\uD80C\uDDCB\uD80C\uDEF4\uD80C\uDC25",
        // Two consecutive high surrogates: each one is expected to be replaced with '?'.
        "3f3f" to "\ud801\uD801",
    )
    for ((expectedHex, text) in cases) {
        buffer.assertUtf8StringEncoded(expectedHex, text, prefixLength)
    }
}
224
+
225
/** Runs the string decoding checks with no prefix bytes in front of the encoded data. */
@Test
fun bufferReadUtf8String() = bufferReadUtf8StringCheck(prefixLength = 0)
229
+
230
/**
 * Runs the string decoding checks with `Segment.SIZE - 1` prefix bytes written first.
 *
 * NOTE(review): judging by the test name, the prefix is meant to make the encoded string
 * span a segment boundary - confirm against the segment layout.
 */
@Test
fun bufferReadUtf8StringCrossSegments() = bufferReadUtf8StringCheck(prefixLength = Segment.SIZE - 1)
234
+
235
/**
 * Feeds a series of UTF-8 byte sequences through one shared buffer and verifies the string
 * decoded from each of them.
 *
 * @param prefixLength number of bytes written before every sequence and skipped again
 * before the string is read.
 */
private fun bufferReadUtf8StringCheck(prefixLength: Int) {
    val buffer = Buffer()
    // Expected string paired with the UTF-8 bytes (hex) it should be decoded from.
    val cases = listOf(
        "hello" to "68656c6c6f",
        "χερετισμός" to "cf87ceb5cf81ceb5cf84ceb9cf83cebccf8ccf82",
        "გამარჯობა" to "e18392e18390e1839be18390e183a0e183afe1839de18391e18390",
        // 𓇋𓋴𓀥, to hail, AN EGYPTIAN HIEROGLYPHIC DICTIONARY, p. 79b
        "\uD80C\uDDCB\uD80C\uDEF4\uD80C\uDC25" to "f093878bf0938bb4f09380a5",
    )
    for ((expectedString, hex) in cases) {
        buffer.assertUtf8StringDecoded(expectedString, hex, prefixLength)
    }
}
194
251
195
252
@Test
@@ -258,6 +315,16 @@ class Utf8Test {
258
315
assertEquals(REPLACEMENT_CODE_POINT , buffer.readUtf8CodePoint())
259
316
assertEquals(REPLACEMENT_CODE_POINT , buffer.readUtf8CodePoint())
260
317
assertTrue(buffer.exhausted())
318
+
319
+ buffer.write(ByteArray (Segment .SIZE - 2 ))
320
+ buffer.write(" f888808080" .decodeHex())
321
+ buffer.skip(Segment .SIZE - 2L )
322
+ assertEquals(REPLACEMENT_CODE_POINT , buffer.readUtf8CodePoint())
323
+ assertEquals(REPLACEMENT_CODE_POINT , buffer.readUtf8CodePoint())
324
+ assertEquals(REPLACEMENT_CODE_POINT , buffer.readUtf8CodePoint())
325
+ assertEquals(REPLACEMENT_CODE_POINT , buffer.readUtf8CodePoint())
326
+ assertEquals(REPLACEMENT_CODE_POINT , buffer.readUtf8CodePoint())
327
+ assertTrue(buffer.exhausted())
261
328
}
262
329
263
330
@Test
@@ -307,6 +374,44 @@ class Utf8Test {
307
374
}
308
375
}
309
376
377
/**
 * Verifies that a multi-byte sequence cut short by the end of the input is decoded as a
 * single replacement character.
 */
@Test
fun readStringWithUnderflow() {
    val buffer = Buffer()
    val replacement = REPLACEMENT_CHARACTER.toString()
    val truncatedSequences = listOf(
        "e183", // 3-byte encoding, last byte missing
        "e1", // 3-byte encoding, last two bytes missing
        "cf", // 2-byte encoding, last byte missing
        "f09383", // 4-byte encoding, last byte missing
        "f093", // 4-byte encoding, last two bytes missing
        "f0", // 4-byte encoding, last three bytes missing
    )
    for (hex in truncatedSequences) {
        buffer.assertUtf8StringDecoded(replacement, hex)
    }
}
391
+
392
/**
 * Verifies that a multi-byte sequence whose continuation bytes are replaced by the ASCII
 * letter 'a' (0x61) decodes to one replacement character followed by those letters.
 */
@Test
fun readStringWithoutContinuationByte() {
    val buffer = Buffer()
    // Expected decoded string paired with the malformed input (hex).
    val cases = listOf(
        // 2-byte encoding, last byte corrupted
        "${REPLACEMENT_CHARACTER}a" to "cf61",
        // 3-byte encoding, last byte corrupted
        "${REPLACEMENT_CHARACTER}a" to "e18361",
        // 3-byte encoding, last two bytes corrupted
        "${REPLACEMENT_CHARACTER}aa" to "e16161",
        // 4-byte encoding, last one, two and three bytes corrupted
        "${REPLACEMENT_CHARACTER}a" to "f0938361",
        "${REPLACEMENT_CHARACTER}aa" to "f0936161",
        "${REPLACEMENT_CHARACTER}aaa" to "f0616161",
    )
    for ((expectedString, hex) in cases) {
        buffer.assertUtf8StringDecoded(expectedString, hex)
    }
}
406
+
407
/**
 * Verifies that a valid UTF-16 surrogate pair is encoded as one 4-byte UTF-8 sequence.
 *
 * The pair `\uD852\uDF62` is code point U+24B62, whose UTF-8 encoding is `f0 a4 ad a2`.
 * The bytes are asserted rather than printed so that a regression actually fails the test.
 */
@Test
fun encodeUtf16SurrogatePair() {
    val buffer = Buffer()
    buffer.writeString("\uD852\uDF62")
    assertArrayEquals("f0a4ada2".decodeHex(), buffer.readByteArray())
}
414
+
310
415
/** Forwards to `assertCodePointDecoded` with the same hex string and code points. */
private fun assertEncoded(hex: String, vararg codePoints: Int) = assertCodePointDecoded(hex, *codePoints)
@@ -321,21 +426,34 @@ class Utf8Test {
321
426
assertEquals(i, codePoints.size) // Checked them all
322
427
}
323
428
324
/**
 * Writes [prefixLength] zero bytes followed by [codePoint], skips the prefix again and
 * asserts that the remaining content of the buffer equals the bytes given by [expectedHex].
 *
 * @param expectedHex expected UTF-8 encoding of [codePoint] as a hex string.
 * @param codePoint code point to encode.
 * @param prefixLength number of padding bytes placed in front of the encoded code point.
 */
private fun Buffer.assertCodePointEncoded(expectedHex: String, codePoint: Int, prefixLength: Int = 0) {
    val padding = ByteArray(prefixLength)
    write(padding)
    writeUtf8CodePoint(codePoint)
    skip(padding.size.toLong())

    val expectedBytes = expectedHex.decodeHex()
    val actualBytes = readByteArray()
    assertArrayEquals(expectedBytes, actualBytes)
}
328
435
329
/**
 * Writes [prefixLength] zero bytes followed by the bytes given by [hex], skips the prefix
 * again and asserts that the next code point read from the buffer is [expectedCodePoint].
 *
 * @param expectedCodePoint code point the bytes should decode to.
 * @param hex UTF-8 input as a hex string.
 * @param prefixLength number of padding bytes placed in front of the input.
 */
private fun Buffer.assertCodePointDecoded(expectedCodePoint: Int, hex: String, prefixLength: Int = 0) {
    val padding = ByteArray(prefixLength)
    write(padding)
    write(hex.decodeHex())
    skip(padding.size.toLong())

    val actualCodePoint = readUtf8CodePoint()
    assertEquals(expectedCodePoint, actualCodePoint)
}
333
442
334
/**
 * Writes [prefixLength] zero bytes followed by [string], skips the prefix again and asserts
 * that the remaining content of the buffer equals the bytes given by [expectedHex].
 *
 * @param expectedHex expected UTF-8 encoding of [string] as a hex string.
 * @param string text to encode.
 * @param prefixLength number of padding bytes placed in front of the encoded text.
 */
private fun Buffer.assertUtf8StringEncoded(expectedHex: String, string: String, prefixLength: Int = 0) {
    val padding = ByteArray(prefixLength)
    write(padding)
    writeString(string)
    skip(padding.size.toLong())

    val expectedBytes = expectedHex.decodeHex()
    val actualBytes = readByteArray()
    assertArrayEquals(expectedBytes, actualBytes)
}
338
449
450
/**
 * Writes [prefixLength] zero bytes followed by the bytes given by [hex], skips the prefix
 * again and asserts that the string read from the buffer equals [expectedString].
 *
 * @param expectedString text the bytes should decode to.
 * @param hex UTF-8 input as a hex string.
 * @param prefixLength number of padding bytes placed in front of the input.
 */
private fun Buffer.assertUtf8StringDecoded(expectedString: String, hex: String, prefixLength: Int = 0) {
    val padding = ByteArray(prefixLength)
    write(padding)
    write(hex.decodeHex())
    skip(padding.size.toLong())

    val actualString = readString()
    assertEquals(expectedString, actualString)
}
456
+
339
457
private fun assertStringEncoded (hex : String , string : String ) {
340
458
val expectedUtf8 = hex.decodeHex()
341
459
0 commit comments