Skip to content

Commit 0431af5

Browse files
authored
Improve test coverage (#290)
Added tests and moved code used only in tests to test source sets
1 parent a04ef0d commit 0431af5

File tree

14 files changed

+268
-141
lines changed

14 files changed

+268
-141
lines changed

core/common/src/-CommonPlatform.kt

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,6 @@
2121

2222
package kotlinx.io
2323

24-
internal expect fun String.asUtf8ToByteArray(): ByteArray
25-
2624
/**
2725
* Signals a general issue that occurred during an I/O operation.
2826
*/

core/common/src/Sinks.kt

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,9 @@
55

66
package kotlinx.io
77

8-
internal val HEX_DIGIT_BYTES = "0123456789abcdef".asUtf8ToByteArray()
8+
private val HEX_DIGIT_BYTES = ByteArray(16) {
9+
((if (it < 10) '0'.code else ('a'.code - 10)) + it).toByte()
10+
}
911

1012
/**
1113
* Writes two bytes containing [short], in the little-endian order, to this sink.
@@ -365,4 +367,4 @@ public fun Sink.writeDoubleLe(double: Double) {
365367
public inline fun Sink.writeToInternalBuffer(lambda: (Buffer) -> Unit) {
366368
lambda(this.buffer)
367369
this.hintEmit()
368-
}
370+
}

core/common/src/internal/-Utf8.kt

Lines changed: 0 additions & 91 deletions
Original file line numberDiff line numberDiff line change
@@ -38,26 +38,6 @@ internal fun ByteArray.commonToUtf8String(beginIndex: Int = 0, endIndex: Int = s
3838
return chars.concatToString(0, length)
3939
}
4040

41-
internal fun String.commonAsUtf8ToByteArray(): ByteArray {
42-
val bytes = ByteArray(4 * length)
43-
44-
// Assume ASCII until a UTF-8 code point is observed. This is ugly but yields
45-
// about a 2x performance increase for pure ASCII.
46-
for (index in indices) {
47-
val b0 = this[index]
48-
if (b0 >= '\u0080') {
49-
var size = index
50-
processUtf8Bytes(index, length) { c ->
51-
bytes[size++] = c
52-
}
53-
return bytes.copyOf(size)
54-
}
55-
bytes[index] = b0.code.toByte()
56-
}
57-
58-
return bytes.copyOf(length)
59-
}
60-
6141
internal const val REPLACEMENT_BYTE: Byte = '?'.code.toByte()
6242
internal const val REPLACEMENT_CHARACTER: Char = '\ufffd'
6343
internal const val REPLACEMENT_CODE_POINT: Int = REPLACEMENT_CHARACTER.code
@@ -72,77 +52,6 @@ internal inline fun isUtf8Continuation(byte: Byte): Boolean {
7252
return byte and 0xc0 == 0x80
7353
}
7454

75-
internal inline fun String.processUtf8Bytes(
76-
beginIndex: Int,
77-
endIndex: Int,
78-
yield: (Byte) -> Unit
79-
) {
80-
// Transcode a UTF-16 String to UTF-8 bytes.
81-
var index = beginIndex
82-
while (index < endIndex) {
83-
val c = this[index]
84-
85-
when {
86-
c < '\u0080' -> {
87-
// Emit a 7-bit character with 1 byte.
88-
yield(c.code.toByte()) // 0xxxxxxx
89-
index++
90-
91-
// Assume there is going to be more ASCII
92-
while (index < endIndex && this[index] < '\u0080') {
93-
yield(this[index++].code.toByte())
94-
}
95-
}
96-
97-
c < '\u0800' -> {
98-
// Emit an 11-bit character with 2 bytes.
99-
/* ktlint-disable no-multi-spaces */
100-
yield((c.code shr 6 or 0xc0).toByte()) // 110xxxxx
101-
yield((c.code and 0x3f or 0x80).toByte()) // 10xxxxxx
102-
/* ktlint-enable no-multi-spaces */
103-
index++
104-
}
105-
106-
c !in '\ud800'..'\udfff' -> {
107-
// Emit a 16-bit character with 3 bytes.
108-
/* ktlint-disable no-multi-spaces */
109-
yield((c.code shr 12 or 0xe0).toByte()) // 1110xxxx
110-
yield((c.code shr 6 and 0x3f or 0x80).toByte()) // 10xxxxxx
111-
yield((c.code and 0x3f or 0x80).toByte()) // 10xxxxxx
112-
/* ktlint-enable no-multi-spaces */
113-
index++
114-
}
115-
116-
else -> {
117-
// c is a surrogate. Make sure it is a high surrogate & that its successor is a low
118-
// surrogate. If not, the UTF-16 is invalid, in which case we emit a replacement
119-
// byte.
120-
if (c > '\udbff' ||
121-
endIndex <= index + 1 ||
122-
this[index + 1] !in '\udc00'..'\udfff'
123-
) {
124-
yield(REPLACEMENT_BYTE)
125-
index++
126-
} else {
127-
// UTF-16 high surrogate: 110110xxxxxxxxxx (10 bits)
128-
// UTF-16 low surrogate: 110111yyyyyyyyyy (10 bits)
129-
// Unicode code point: 00010000000000000000 + xxxxxxxxxxyyyyyyyyyy (21 bits)
130-
val codePoint = (((c.code shl 10) + this[index + 1].code) + (0x010000 - (0xd800 shl 10) - 0xdc00))
131-
132-
// Emit a 21-bit character with 4 bytes.
133-
/* ktlint-disable no-multi-spaces */
134-
yield((codePoint shr 18 or 0xf0).toByte()) // 11110xxx
135-
yield((codePoint shr 12 and 0x3f or 0x80).toByte()) // 10xxxxxx
136-
yield((codePoint shr 6 and 0x3f or 0x80).toByte()) // 10xxyyyy
137-
yield((codePoint and 0x3f or 0x80).toByte()) // 10yyyyyy
138-
/* ktlint-enable no-multi-spaces */
139-
index += 2
140-
}
141-
}
142-
}
143-
}
144-
}
145-
14655
internal inline fun ByteArray.processUtf8CodePoints(
14756
beginIndex: Int,
14857
endIndex: Int,

core/common/test/AbstractSourceTest.kt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -615,7 +615,7 @@ abstract class AbstractBufferedSourceTest internal constructor(
615615
val string = "abcd" + "e".repeat(Segment.SIZE)
616616
sink.writeString(string)
617617
sink.emit()
618-
assertArrayEquals(string.asUtf8ToByteArray(), source.readByteArray())
618+
assertArrayEquals(string.commonAsUtf8ToByteArray(), source.readByteArray())
619619
}
620620

621621
@Test

core/common/test/Utf8Test.kt

Lines changed: 149 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,8 @@
2121

2222
package kotlinx.io
2323

24+
import kotlinx.io.internal.REPLACEMENT_CHARACTER
2425
import kotlinx.io.internal.REPLACEMENT_CODE_POINT
25-
import kotlinx.io.internal.commonAsUtf8ToByteArray
2626
import kotlinx.io.internal.processUtf8CodePoints
2727
import kotlin.test.*
2828

@@ -144,52 +144,109 @@ class Utf8Test {
144144

145145
@Test
146146
fun bufferWriteCodePoints() {
147+
bufferWriteCodePointsCheck(0)
148+
}
149+
150+
@Test
151+
fun bufferWriteCodePointsCrossSegments() {
152+
bufferWriteCodePointsCheck(Segment.SIZE - 1)
153+
}
154+
155+
private fun bufferWriteCodePointsCheck(prefixLength: Int) {
147156
val buffer = Buffer()
148-
buffer.assertCodePointEncoded("40", '@'.code)
149-
buffer.assertCodePointEncoded("7f", '\u007f'.code)
150-
buffer.assertCodePointEncoded("c280", '\u0080'.code)
151-
buffer.assertCodePointEncoded("c2a9", '\u00a9'.code)
152-
buffer.assertCodePointEncoded("c3bf", '\u00ff'.code)
153-
buffer.assertCodePointEncoded("dfbf", '\u07ff'.code)
154-
buffer.assertCodePointEncoded("e0a080", '\u0800'.code)
155-
buffer.assertCodePointEncoded("e1839a", '\u10da'.code)
156-
buffer.assertCodePointEncoded("efbfbf", '\uffff'.code)
157-
buffer.assertCodePointEncoded("f0908080", 0x10000)
158-
buffer.assertCodePointEncoded("f48087bf", 0x1001FF)
157+
buffer.assertCodePointEncoded("40", '@'.code, prefixLength)
158+
buffer.assertCodePointEncoded("7f", '\u007f'.code, prefixLength)
159+
buffer.assertCodePointEncoded("c280", '\u0080'.code, prefixLength)
160+
buffer.assertCodePointEncoded("c2a9", '\u00a9'.code, prefixLength)
161+
buffer.assertCodePointEncoded("c3bf", '\u00ff'.code, prefixLength)
162+
buffer.assertCodePointEncoded("dfbf", '\u07ff'.code, prefixLength)
163+
buffer.assertCodePointEncoded("e0a080", '\u0800'.code, prefixLength)
164+
buffer.assertCodePointEncoded("e1839a", '\u10da'.code, prefixLength)
165+
buffer.assertCodePointEncoded("efbfbf", '\uffff'.code, prefixLength)
166+
buffer.assertCodePointEncoded("f0908080", 0x10000, prefixLength)
167+
buffer.assertCodePointEncoded("f48087bf", 0x1001FF, prefixLength)
159168
}
160169

161170
@Test
162171
fun bufferReadCodePoints() {
172+
bufferReadCodePointsCheck(0)
173+
}
174+
175+
@Test
176+
fun bufferReadCodePointsCrossSegments() {
177+
bufferReadCodePointsCheck(Segment.SIZE - 1)
178+
}
179+
180+
private fun bufferReadCodePointsCheck(prefixLength: Int) {
163181
val buffer = Buffer()
164-
buffer.assertCodePointDecoded('@'.code, "40")
165-
buffer.assertCodePointDecoded('\u007f'.code, "7f")
166-
buffer.assertCodePointDecoded('\u0080'.code, "c280")
167-
buffer.assertCodePointDecoded('\u00a9'.code, "c2a9")
168-
buffer.assertCodePointDecoded('\u00ff'.code, "c3bf")
169-
buffer.assertCodePointDecoded('\u07ff'.code, "dfbf")
170-
buffer.assertCodePointDecoded('\u0800'.code, "e0a080")
171-
buffer.assertCodePointDecoded('\u10da'.code, "e1839a")
172-
buffer.assertCodePointDecoded('\uffff'.code, "efbfbf")
173-
buffer.assertCodePointDecoded(0x10000, "f0908080")
174-
buffer.assertCodePointDecoded(0x1001FF, "f48087bf")
182+
buffer.assertCodePointDecoded('@'.code, "40", prefixLength)
183+
buffer.assertCodePointDecoded('\u007f'.code, "7f", prefixLength)
184+
buffer.assertCodePointDecoded('\u0080'.code, "c280", prefixLength)
185+
buffer.assertCodePointDecoded('\u00a9'.code, "c2a9", prefixLength)
186+
buffer.assertCodePointDecoded('\u00ff'.code, "c3bf", prefixLength)
187+
buffer.assertCodePointDecoded('\u07ff'.code, "dfbf", prefixLength)
188+
buffer.assertCodePointDecoded('\u0800'.code, "e0a080", prefixLength)
189+
buffer.assertCodePointDecoded('\u10da'.code, "e1839a", prefixLength)
190+
buffer.assertCodePointDecoded('\uffff'.code, "efbfbf", prefixLength)
191+
buffer.assertCodePointDecoded(0x10000, "f0908080", prefixLength)
192+
buffer.assertCodePointDecoded(0x1001FF, "f48087bf", prefixLength)
175193
}
176194

177195
@Test
178196
fun bufferWriteUtf8String() {
197+
bufferWriteUtf8StringCheck(0)
198+
}
199+
200+
@Test
201+
fun bufferWriteUtf8StringCrossSegments() {
202+
bufferWriteUtf8StringCheck(Segment.SIZE - 1)
203+
}
204+
205+
private fun bufferWriteUtf8StringCheck(prefixLength: Int) {
179206
val buffer = Buffer()
180-
buffer.assertUtf8StringEncoded("68656c6c6f", "hello")
181-
buffer.assertUtf8StringEncoded("cf87ceb5cf81ceb5cf84ceb9cf83cebccf8ccf82", "χερετισμός")
207+
buffer.assertUtf8StringEncoded("68656c6c6f", "hello", prefixLength)
208+
buffer.assertUtf8StringEncoded("cf87ceb5cf81ceb5cf84ceb9cf83cebccf8ccf82", "χερετισμός",
209+
prefixLength)
182210
buffer.assertUtf8StringEncoded(
183211
"e18392e18390e1839be18390e183a0e183afe1839de18391e18390",
184-
"გამარჯობა"
212+
"გამარჯობა",
213+
prefixLength
185214
)
186215
buffer.assertUtf8StringEncoded(
187216
"f093878bf0938bb4f09380a5",
188-
"\uD80C\uDDCB\uD80C\uDEF4\uD80C\uDC25" /* 𓇋𓋴𓀥, to hail, AN EGYPTIAN HIEROGLYPHIC DICTIONARY, p. 79b */
217+
"\uD80C\uDDCB\uD80C\uDEF4\uD80C\uDC25",/* 𓇋𓋴𓀥, to hail, AN EGYPTIAN HIEROGLYPHIC DICTIONARY, p. 79b */
218+
prefixLength
189219
)
190220

191221
// two consecutive high surrogates, replace with '?'
192-
buffer.assertUtf8StringEncoded("3f3f", "\ud801\uD801")
222+
buffer.assertUtf8StringEncoded("3f3f", "\ud801\uD801", prefixLength)
223+
}
224+
225+
@Test
226+
fun bufferReadUtf8String() {
227+
bufferReadUtf8StringCheck(0)
228+
}
229+
230+
@Test
231+
fun bufferReadUtf8StringCrossSegments() {
232+
bufferReadUtf8StringCheck(Segment.SIZE - 1)
233+
}
234+
235+
private fun bufferReadUtf8StringCheck(prefixLength: Int) {
236+
val buffer = Buffer()
237+
buffer.assertUtf8StringDecoded("hello","68656c6c6f", prefixLength)
238+
buffer.assertUtf8StringDecoded("χερετισμός", "cf87ceb5cf81ceb5cf84ceb9cf83cebccf8ccf82",
239+
prefixLength)
240+
buffer.assertUtf8StringDecoded(
241+
"გამარჯობა",
242+
"e18392e18390e1839be18390e183a0e183afe1839de18391e18390",
243+
prefixLength
244+
)
245+
buffer.assertUtf8StringDecoded(
246+
"\uD80C\uDDCB\uD80C\uDEF4\uD80C\uDC25",/* 𓇋𓋴𓀥, to hail, AN EGYPTIAN HIEROGLYPHIC DICTIONARY, p. 79b */
247+
"f093878bf0938bb4f09380a5",
248+
prefixLength
249+
)
193250
}
194251

195252
@Test
@@ -258,6 +315,16 @@ class Utf8Test {
258315
assertEquals(REPLACEMENT_CODE_POINT, buffer.readUtf8CodePoint())
259316
assertEquals(REPLACEMENT_CODE_POINT, buffer.readUtf8CodePoint())
260317
assertTrue(buffer.exhausted())
318+
319+
buffer.write(ByteArray(Segment.SIZE - 2))
320+
buffer.write("f888808080".decodeHex())
321+
buffer.skip(Segment.SIZE - 2L)
322+
assertEquals(REPLACEMENT_CODE_POINT, buffer.readUtf8CodePoint())
323+
assertEquals(REPLACEMENT_CODE_POINT, buffer.readUtf8CodePoint())
324+
assertEquals(REPLACEMENT_CODE_POINT, buffer.readUtf8CodePoint())
325+
assertEquals(REPLACEMENT_CODE_POINT, buffer.readUtf8CodePoint())
326+
assertEquals(REPLACEMENT_CODE_POINT, buffer.readUtf8CodePoint())
327+
assertTrue(buffer.exhausted())
261328
}
262329

263330
@Test
@@ -307,6 +374,44 @@ class Utf8Test {
307374
}
308375
}
309376

377+
@Test
378+
fun readStringWithUnderflow() {
379+
val buffer = Buffer()
380+
// 3 byte-encoded, last byte missing
381+
buffer.assertUtf8StringDecoded(REPLACEMENT_CHARACTER.toString(), "e183")
382+
// 3 byte-encoded, last two bytes missing
383+
buffer.assertUtf8StringDecoded(REPLACEMENT_CHARACTER.toString(), "e1")
384+
// 2 byte-encoded, last byte missing
385+
buffer.assertUtf8StringDecoded(REPLACEMENT_CHARACTER.toString(), "cf")
386+
// 4 byte encoded, various underflows
387+
buffer.assertUtf8StringDecoded(REPLACEMENT_CHARACTER.toString(), "f09383")
388+
buffer.assertUtf8StringDecoded(REPLACEMENT_CHARACTER.toString(), "f093")
389+
buffer.assertUtf8StringDecoded(REPLACEMENT_CHARACTER.toString(), "f0")
390+
}
391+
392+
@Test
393+
fun readStringWithoutContinuationByte() {
394+
val buffer = Buffer()
395+
// 2 byte-encoded, last byte corrupted
396+
buffer.assertUtf8StringDecoded("${REPLACEMENT_CHARACTER}a", "cf61")
397+
// 3 byte-encoded, last byte corrupted
398+
buffer.assertUtf8StringDecoded("${REPLACEMENT_CHARACTER}a", "e18361")
399+
// 3 byte-encoded, last two bytes corrupted
400+
buffer.assertUtf8StringDecoded("${REPLACEMENT_CHARACTER}aa", "e16161")
401+
// 4 byte-encoded, various bytes corrupted
402+
buffer.assertUtf8StringDecoded("${REPLACEMENT_CHARACTER}a", "f0938361")
403+
buffer.assertUtf8StringDecoded("${REPLACEMENT_CHARACTER}aa", "f0936161")
404+
buffer.assertUtf8StringDecoded("${REPLACEMENT_CHARACTER}aaa", "f0616161")
405+
}
406+
407+
@OptIn(ExperimentalStdlibApi::class)
408+
@Test
409+
fun encodeUtf16SurrogatePair() {
410+
val buffer = Buffer()
411+
buffer.writeString("\uD852\uDF62")
412+
println(buffer.readByteArray().toHexString())
413+
}
414+
310415
private fun assertEncoded(hex: String, vararg codePoints: Int) {
311416
assertCodePointDecoded(hex, *codePoints)
312417
}
@@ -321,21 +426,34 @@ class Utf8Test {
321426
assertEquals(i, codePoints.size) // Checked them all
322427
}
323428

324-
private fun Buffer.assertCodePointEncoded(expectedHex: String, codePoint: Int) {
429+
private fun Buffer.assertCodePointEncoded(expectedHex: String, codePoint: Int, prefixLength: Int = 0) {
430+
write(ByteArray(prefixLength))
325431
writeUtf8CodePoint(codePoint)
432+
skip(prefixLength.toLong())
326433
assertArrayEquals(expectedHex.decodeHex(), readByteArray())
327434
}
328435

329-
private fun Buffer.assertCodePointDecoded(expectedCodePoint: Int, hex: String) {
436+
private fun Buffer.assertCodePointDecoded(expectedCodePoint: Int, hex: String, prefixLength: Int = 0) {
437+
write(ByteArray(prefixLength))
330438
write(hex.decodeHex())
439+
skip(prefixLength.toLong())
331440
assertEquals(expectedCodePoint, readUtf8CodePoint())
332441
}
333442

334-
private fun Buffer.assertUtf8StringEncoded(expectedHex: String, string: String) {
443+
private fun Buffer.assertUtf8StringEncoded(expectedHex: String, string: String, prefixLength: Int = 0) {
444+
write(ByteArray(prefixLength))
335445
writeString(string)
446+
skip(prefixLength.toLong())
336447
assertArrayEquals(expectedHex.decodeHex(), readByteArray())
337448
}
338449

450+
private fun Buffer.assertUtf8StringDecoded(expectedString: String, hex: String, prefixLength: Int = 0) {
451+
write(ByteArray(prefixLength))
452+
write(hex.decodeHex())
453+
skip(prefixLength.toLong())
454+
assertEquals(expectedString, readString())
455+
}
456+
339457
private fun assertStringEncoded(hex: String, string: String) {
340458
val expectedUtf8 = hex.decodeHex()
341459

0 commit comments

Comments
 (0)