Skip to content

Commit 3d2f4bd

Browse files
committed
Make functions to read and write code points public
Fixes #307
1 parent 0431af5 commit 3d2f4bd

File tree

6 files changed

+118
-42
lines changed

6 files changed

+118
-42
lines changed

core/api/kotlinx-io-core.api

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -187,12 +187,14 @@ public final class kotlinx/io/SourcesKt {
187187
}
188188

189189
public final class kotlinx/io/Utf8Kt {
190+
public static final fun readCodePointValue (Lkotlinx/io/Source;)I
190191
public static final fun readLine (Lkotlinx/io/Source;)Ljava/lang/String;
191192
public static final fun readLineStrict (Lkotlinx/io/Source;J)Ljava/lang/String;
192193
public static synthetic fun readLineStrict$default (Lkotlinx/io/Source;JILjava/lang/Object;)Ljava/lang/String;
193194
public static final fun readString (Lkotlinx/io/Buffer;)Ljava/lang/String;
194195
public static final fun readString (Lkotlinx/io/Source;)Ljava/lang/String;
195196
public static final fun readString (Lkotlinx/io/Source;J)Ljava/lang/String;
197+
public static final fun writeCodePointValue (Lkotlinx/io/Sink;I)V
196198
public static final fun writeString (Lkotlinx/io/Sink;Ljava/lang/String;II)V
197199
public static synthetic fun writeString$default (Lkotlinx/io/Sink;Ljava/lang/String;IIILjava/lang/Object;)V
198200
}

core/api/kotlinx-io-core.klib.api

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@ final fun (kotlinx.io/Buffer).kotlinx.io/snapshot(): kotlinx.io.bytestring/ByteS
8282
final fun (kotlinx.io/RawSink).kotlinx.io/buffered(): kotlinx.io/Sink // kotlinx.io/buffered|buffered@kotlinx.io.RawSink(){}[0]
8383
final fun (kotlinx.io/RawSource).kotlinx.io/buffered(): kotlinx.io/Source // kotlinx.io/buffered|buffered@kotlinx.io.RawSource(){}[0]
8484
final fun (kotlinx.io/Sink).kotlinx.io/write(kotlinx.io.bytestring/ByteString, kotlin/Int =..., kotlin/Int =...) // kotlinx.io/write|write@kotlinx.io.Sink(kotlinx.io.bytestring.ByteString;kotlin.Int;kotlin.Int){}[0]
85+
final fun (kotlinx.io/Sink).kotlinx.io/writeCodePointValue(kotlin/Int) // kotlinx.io/writeCodePointValue|writeCodePointValue@kotlinx.io.Sink(kotlin.Int){}[0]
8586
final fun (kotlinx.io/Sink).kotlinx.io/writeDecimalLong(kotlin/Long) // kotlinx.io/writeDecimalLong|writeDecimalLong@kotlinx.io.Sink(kotlin.Long){}[0]
8687
final fun (kotlinx.io/Sink).kotlinx.io/writeDouble(kotlin/Double) // kotlinx.io/writeDouble|writeDouble@kotlinx.io.Sink(kotlin.Double){}[0]
8788
final fun (kotlinx.io/Sink).kotlinx.io/writeDoubleLe(kotlin/Double) // kotlinx.io/writeDoubleLe|writeDoubleLe@kotlinx.io.Sink(kotlin.Double){}[0]
@@ -105,6 +106,7 @@ final fun (kotlinx.io/Source).kotlinx.io/readByteArray(): kotlin/ByteArray // ko
105106
final fun (kotlinx.io/Source).kotlinx.io/readByteArray(kotlin/Int): kotlin/ByteArray // kotlinx.io/readByteArray|readByteArray@kotlinx.io.Source(kotlin.Int){}[0]
106107
final fun (kotlinx.io/Source).kotlinx.io/readByteString(): kotlinx.io.bytestring/ByteString // kotlinx.io/readByteString|readByteString@kotlinx.io.Source(){}[0]
107108
final fun (kotlinx.io/Source).kotlinx.io/readByteString(kotlin/Int): kotlinx.io.bytestring/ByteString // kotlinx.io/readByteString|readByteString@kotlinx.io.Source(kotlin.Int){}[0]
109+
final fun (kotlinx.io/Source).kotlinx.io/readCodePointValue(): kotlin/Int // kotlinx.io/readCodePointValue|readCodePointValue@kotlinx.io.Source(){}[0]
108110
final fun (kotlinx.io/Source).kotlinx.io/readDecimalLong(): kotlin/Long // kotlinx.io/readDecimalLong|readDecimalLong@kotlinx.io.Source(){}[0]
109111
final fun (kotlinx.io/Source).kotlinx.io/readDouble(): kotlin/Double // kotlinx.io/readDouble|readDouble@kotlinx.io.Source(){}[0]
110112
final fun (kotlinx.io/Source).kotlinx.io/readDoubleLe(): kotlin/Double // kotlinx.io/readDoubleLe|readDoubleLe@kotlinx.io.Source(){}[0]

core/common/src/Utf8.kt

Lines changed: 20 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -122,14 +122,23 @@ internal fun String.utf8Size(startIndex: Int = 0, endIndex: Int = length): Long
122122
/**
123123
* Encodes [codePoint] in UTF-8 and writes it to this sink.
124124
*
125+
* Note that in general, a value retrieved from [Char.code] cannot be written directly
126+
* as it may be a part of a [surrogate pair](https://www.unicode.org/faq/utf_bom.html#utf16-2) (that could be
127+
* detected using [Char.isSurrogate], or [Char.isHighSurrogate] and [Char.isLowSurrogate]).
128+
* Such a pair of characters needs to be manually converted back to a single code point
129+
* which then could be written to a [Sink].
130+
* Without such a conversion, data written to a [Sink] can no
131+
* longer be decoded back into the string from which the surrogate pair was retrieved.
132+
*
125133
* @param codePoint the codePoint to be written.
126134
*
127135
* @throws IllegalStateException when the sink is closed.
128136
*
129-
* @sample kotlinx.io.samples.KotlinxIoCoreCommonSamples.utf8CodePointSample
137+
* @sample kotlinx.io.samples.KotlinxIoCoreCommonSamples.writeUtf8CodePointSample
138+
* @sample kotlinx.io.samples.KotlinxIoCoreCommonSamples.writeSurrogatePair
130139
*/
131140
@OptIn(DelicateIoApi::class)
132-
internal fun Sink.writeUtf8CodePoint(codePoint: Int): Unit =
141+
public fun Sink.writeCodePointValue(codePoint: Int): Unit =
133142
writeToInternalBuffer { it.commonWriteUtf8CodePoint(codePoint) }
134143

135144
/**
@@ -196,24 +205,29 @@ public fun Source.readString(byteCount: Long): String {
196205
}
197206

198207
/**
199-
* Removes and returns a single UTF-8 code point, reading between 1 and 4 bytes as necessary.
208+
* Decodes a single code point value from UTF-8 code units, reading between 1 and 4 bytes as necessary.
200209
*
201210
* If this source is exhausted before a complete code point can be read, this throws an
202211
* [EOFException] and consumes no input.
203212
*
204-
* If this source doesn't start with a properly-encoded UTF-8 code point, this method will remove
213+
* If this source doesn't start with a properly encoded UTF-8 code point, this method will remove
205214
* 1 or more non-UTF-8 bytes and return the replacement character (`U+fffd`). This covers encoding
206-
* problems (the input is not properly-encoded UTF-8), characters out of range (beyond the
215+
* problems (the input is not properly encoded UTF-8), characters out of range (beyond the
207216
* `0x10ffff` limit of Unicode), code points for UTF-16 surrogates (`U+d800`..`U+dfff`) and overlong
208217
* encodings (such as `0xc080` for the NUL character in modified UTF-8).
209218
*
219+
* Note that in general, the returned value may not be directly convertible to [Char], as it may be out
220+
* of [Char]'s value range, in which case it should be manually converted to a
221+
* [surrogate pair](https://www.unicode.org/faq/utf_bom.html#utf16-2).
222+
*
210223
* @throws EOFException when the source is exhausted before a complete code point can be read.
211224
* @throws IllegalStateException when the source is closed.
212225
*
213226
* @sample kotlinx.io.samples.KotlinxIoCoreCommonSamples.readUtf8CodePointSample
227+
* @sample kotlinx.io.samples.KotlinxIoCoreCommonSamples.surrogatePairs
214228
*/
215229
@OptIn(InternalIoApi::class)
216-
internal fun Source.readUtf8CodePoint(): Int {
230+
public fun Source.readCodePointValue(): Int {
217231
require(1)
218232

219233
val b0 = buffer[0].toInt()
@@ -226,13 +240,6 @@ internal fun Source.readUtf8CodePoint(): Int {
226240
return buffer.commonReadUtf8CodePoint()
227241
}
228242

229-
/**
230-
* @see Source.readUtf8CodePoint
231-
*/
232-
internal fun Buffer.readUtf8CodePoint(): Int {
233-
return this.commonReadUtf8CodePoint()
234-
}
235-
236243
/**
237244
* Removes and returns UTF-8 encoded characters up to but not including the next line break. A line break is
238245
* either `"\n"` or `"\r\n"`; these characters are not included in the result.

core/common/test/AbstractSourceTest.kt

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1099,25 +1099,25 @@ abstract class AbstractBufferedSourceTest internal constructor(
10991099
with(sink) {
11001100
writeByte(0x7f)
11011101
emit()
1102-
assertEquals(0x7f, source.readUtf8CodePoint().toLong())
1102+
assertEquals(0x7f, source.readCodePointValue().toLong())
11031103

11041104
writeByte(0xdf.toByte())
11051105
writeByte(0xbf.toByte())
11061106
emit()
1107-
assertEquals(0x07ff, source.readUtf8CodePoint().toLong())
1107+
assertEquals(0x07ff, source.readCodePointValue().toLong())
11081108

11091109
writeByte(0xef.toByte())
11101110
writeByte(0xbf.toByte())
11111111
writeByte(0xbf.toByte())
11121112
emit()
1113-
assertEquals(0xffff, source.readUtf8CodePoint().toLong())
1113+
assertEquals(0xffff, source.readCodePointValue().toLong())
11141114

11151115
writeByte(0xf4.toByte())
11161116
writeByte(0x8f.toByte())
11171117
writeByte(0xbf.toByte())
11181118
writeByte(0xbf.toByte())
11191119
emit()
1120-
assertEquals(0x10ffff, source.readUtf8CodePoint().toLong())
1120+
assertEquals(0x10ffff, source.readCodePointValue().toLong())
11211121
}
11221122
}
11231123

@@ -1126,20 +1126,20 @@ abstract class AbstractBufferedSourceTest internal constructor(
11261126
with(sink) {
11271127
writeByte(0xdf.toByte()) // a second byte is missing
11281128
emit()
1129-
assertFailsWith<EOFException> { source.readUtf8CodePoint() }
1129+
assertFailsWith<EOFException> { source.readCodePointValue() }
11301130
assertEquals(1, source.readByteArray().size)
11311131

11321132
writeByte(0xe2.toByte())
11331133
writeByte(0x98.toByte()) // a third byte is missing
11341134
emit()
1135-
assertFailsWith<EOFException> { source.readUtf8CodePoint() }
1135+
assertFailsWith<EOFException> { source.readCodePointValue() }
11361136
assertEquals(2, source.readByteArray().size)
11371137

11381138
writeByte(0xf0.toByte())
11391139
writeByte(0x9f.toByte())
11401140
writeByte(0x92.toByte()) // a fourth byte is missing
11411141
emit()
1142-
assertFailsWith<EOFException> { source.readUtf8CodePoint() }
1142+
assertFailsWith<EOFException> { source.readCodePointValue() }
11431143
assertEquals(3, source.readByteArray().size)
11441144
}
11451145
}

core/common/test/Utf8Test.kt

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -285,22 +285,22 @@ class Utf8Test {
285285
@Test
286286
fun readCodePointFromEmptyBufferThrowsEofException() {
287287
val buffer = Buffer()
288-
assertFailsWith<EOFException> { buffer.readUtf8CodePoint() }
288+
assertFailsWith<EOFException> { buffer.readCodePointValue() }
289289
}
290290

291291
@Test
292292
fun readLeadingContinuationByteReturnsReplacementCharacter() {
293293
val buffer = Buffer()
294294
buffer.writeByte(0xbf.toByte())
295-
assertEquals(REPLACEMENT_CODE_POINT, buffer.readUtf8CodePoint())
295+
assertEquals(REPLACEMENT_CODE_POINT, buffer.readCodePointValue())
296296
assertTrue(buffer.exhausted())
297297
}
298298

299299
@Test
300300
fun readMissingContinuationBytesThrowsEofException() {
301301
val buffer = Buffer()
302302
buffer.writeByte(0xdf.toByte())
303-
assertFailsWith<EOFException> { buffer.readUtf8CodePoint() }
303+
assertFailsWith<EOFException> { buffer.readCodePointValue() }
304304
assertFalse(buffer.exhausted()) // Prefix byte wasn't consumed.
305305
}
306306

@@ -309,11 +309,11 @@ class Utf8Test {
309309
// 5-byte and 6-byte code points are not supported.
310310
val buffer = Buffer()
311311
buffer.write("f888808080".decodeHex())
312-
assertEquals(REPLACEMENT_CODE_POINT, buffer.readUtf8CodePoint())
313-
assertEquals(REPLACEMENT_CODE_POINT, buffer.readUtf8CodePoint())
314-
assertEquals(REPLACEMENT_CODE_POINT, buffer.readUtf8CodePoint())
315-
assertEquals(REPLACEMENT_CODE_POINT, buffer.readUtf8CodePoint())
316-
assertEquals(REPLACEMENT_CODE_POINT, buffer.readUtf8CodePoint())
312+
assertEquals(REPLACEMENT_CODE_POINT, buffer.readCodePointValue())
313+
assertEquals(REPLACEMENT_CODE_POINT, buffer.readCodePointValue())
314+
assertEquals(REPLACEMENT_CODE_POINT, buffer.readCodePointValue())
315+
assertEquals(REPLACEMENT_CODE_POINT, buffer.readCodePointValue())
316+
assertEquals(REPLACEMENT_CODE_POINT, buffer.readCodePointValue())
317317
assertTrue(buffer.exhausted())
318318

319319
buffer.write(ByteArray(Segment.SIZE - 2))
@@ -332,8 +332,8 @@ class Utf8Test {
332332
// Use a non-continuation byte where a continuation byte is expected.
333333
val buffer = Buffer()
334334
buffer.write("df20".decodeHex())
335-
assertEquals(REPLACEMENT_CODE_POINT, buffer.readUtf8CodePoint())
336-
assertEquals(0x20, buffer.readUtf8CodePoint()) // Non-continuation character not consumed.
335+
assertEquals(REPLACEMENT_CODE_POINT, buffer.readCodePointValue())
336+
assertEquals(0x20, buffer.readCodePointValue()) // Non-continuation character not consumed.
337337
assertTrue(buffer.exhausted())
338338
}
339339

@@ -342,18 +342,18 @@ class Utf8Test {
342342
// A 4-byte encoding with data above the U+10ffff Unicode maximum.
343343
val buffer = Buffer()
344344
buffer.write("f4908080".decodeHex())
345-
assertEquals(REPLACEMENT_CODE_POINT, buffer.readUtf8CodePoint())
345+
assertEquals(REPLACEMENT_CODE_POINT, buffer.readCodePointValue())
346346
assertTrue(buffer.exhausted())
347347
}
348348

349349
@Test
350350
fun readSurrogateCodePoint() {
351351
val buffer = Buffer()
352352
buffer.write("eda080".decodeHex())
353-
assertEquals(REPLACEMENT_CODE_POINT, buffer.readUtf8CodePoint())
353+
assertEquals(REPLACEMENT_CODE_POINT, buffer.readCodePointValue())
354354
assertTrue(buffer.exhausted())
355355
buffer.write("edbfbf".decodeHex())
356-
assertEquals(REPLACEMENT_CODE_POINT, buffer.readUtf8CodePoint())
356+
assertEquals(REPLACEMENT_CODE_POINT, buffer.readCodePointValue())
357357
assertTrue(buffer.exhausted())
358358
}
359359

@@ -362,15 +362,15 @@ class Utf8Test {
362362
// Use 2 bytes to encode data that only needs 1 byte.
363363
val buffer = Buffer()
364364
buffer.write("c080".decodeHex())
365-
assertEquals(REPLACEMENT_CODE_POINT, buffer.readUtf8CodePoint())
365+
assertEquals(REPLACEMENT_CODE_POINT, buffer.readCodePointValue())
366366
assertTrue(buffer.exhausted())
367367
}
368368

369369
@Test
370370
fun writeCodePointBeyondUnicodeMaximum() {
371371
val buffer = Buffer()
372372
assertFailsWith<IllegalArgumentException>("Unexpected code point: 0x110000") {
373-
buffer.writeUtf8CodePoint(0x110000)
373+
buffer.writeCodePointValue(0x110000)
374374
}
375375
}
376376

@@ -428,7 +428,7 @@ class Utf8Test {
428428

429429
private fun Buffer.assertCodePointEncoded(expectedHex: String, codePoint: Int, prefixLength: Int = 0) {
430430
write(ByteArray(prefixLength))
431-
writeUtf8CodePoint(codePoint)
431+
writeCodePointValue(codePoint)
432432
skip(prefixLength.toLong())
433433
assertArrayEquals(expectedHex.decodeHex(), readByteArray())
434434
}
@@ -437,7 +437,7 @@ class Utf8Test {
437437
write(ByteArray(prefixLength))
438438
write(hex.decodeHex())
439439
skip(prefixLength.toLong())
440-
assertEquals(expectedCodePoint, readUtf8CodePoint())
440+
assertEquals(expectedCodePoint, readCodePointValue())
441441
}
442442

443443
private fun Buffer.assertUtf8StringEncoded(expectedHex: String, string: String, prefixLength: Int = 0) {
@@ -469,7 +469,7 @@ class Utf8Test {
469469
val bufferUtf8 = Buffer()
470470
for (charIdx in string.indices) {
471471
val c = string[charIdx]
472-
bufferUtf8.writeUtf8CodePoint(c.code)
472+
bufferUtf8.writeCodePointValue(c.code)
473473
}
474474
assertArrayEquals(expectedUtf8, bufferUtf8.readByteArray())
475475

core/common/test/samples/samples.kt

Lines changed: 69 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -100,19 +100,84 @@ class KotlinxIoCoreCommonSamples {
100100
fun writeUtf8CodePointSample() {
101101
val buffer = Buffer()
102102

103-
buffer.writeInt('Δ'.code) // writes integer value as is
104-
assertContentEquals(byteArrayOf(0, 0, 0x3, 0x94.toByte()), buffer.readByteArray())
103+
// Basic Latin (a.k.a. ASCII) characters are encoded with a single byte
104+
buffer.writeCodePointValue('Y'.code)
105+
assertContentEquals(byteArrayOf(0x59), buffer.readByteArray())
105106

106-
buffer.writeUtf8CodePoint('Δ'.code) // encodes code point using UTF-8 encoding
107+
// wider characters are encoded into multiple UTF-8 code units
108+
buffer.writeCodePointValue('Δ'.code)
107109
assertContentEquals(byteArrayOf(0xce.toByte(), 0x94.toByte()), buffer.readByteArray())
110+
111+
// note the difference: unlike writeCodePointValue, writeInt writes the integer value as is, without UTF-8 encoding
112+
buffer.writeInt('Δ'.code)
113+
assertContentEquals(byteArrayOf(0, 0, 0x3, 0x94.toByte()), buffer.readByteArray())
114+
}
115+
116+
@Test
117+
fun writeSurrogatePair() {
118+
val buffer = Buffer()
119+
120+
// U+1F31E (a.k.a. "sun with face") is too wide to fit in a single UTF-16 character,
121+
// so it's represented using a surrogate pair.
122+
val chars = "🌞".toCharArray()
123+
assertEquals(2, chars.size)
124+
125+
// such a pair has to be manually converted to a single code point
126+
assertTrue(chars[0].isHighSurrogate())
127+
assertTrue(chars[1].isLowSurrogate())
128+
129+
val highSurrogate = chars[0].code
130+
val lowSurrogate = chars[1].code
131+
132+
// see https://en.wikipedia.org/wiki/UTF-16#Code_points_from_U+010000_to_U+10FFFF for details
133+
val codePoint = 0x10000 + (highSurrogate - 0xD800).shl(10).or(lowSurrogate - 0xDC00)
134+
assertEquals(0x1F31E, codePoint)
135+
136+
// now we can write the code point
137+
buffer.writeCodePointValue(codePoint)
138+
// and read the correct string back
139+
assertEquals("🌞", buffer.readString())
140+
141+
// we won't achieve that by writing the surrogates as they are
142+
buffer.apply {
143+
writeCodePointValue(highSurrogate)
144+
writeCodePointValue(lowSurrogate)
145+
}
146+
assertNotEquals("🌞", buffer.readString())
108147
}
109148

110149
@Test
111150
fun readUtf8CodePointSample() {
112151
val buffer = Buffer()
113152

114153
buffer.writeUShort(0xce94U)
115-
assertEquals(0x394, buffer.readUtf8CodePoint()) // decodes single UTF-8 encoded code point
154+
assertEquals(0x394, buffer.readCodePointValue()) // decodes a single UTF-8 encoded code point
155+
}
156+
157+
@Test
158+
fun surrogatePairs() {
159+
val buffer = Buffer()
160+
161+
// that's a U+1F31A, a.k.a. "new moon with face"
162+
buffer.writeString("🌚")
163+
// it should be encoded with 4 code units
164+
assertEquals(4, buffer.size)
165+
166+
// let's read it back as a single code point
167+
val moonCodePoint = buffer.readCodePointValue()
168+
// all code units were consumed
169+
assertEquals(0, buffer.size)
170+
171+
// the moon is too wide to fit in a single UTF-16 character!
172+
assertNotEquals(moonCodePoint, moonCodePoint.toChar().code)
173+
// "too wide" means in the [U+010000, U+10FFFF] range
174+
assertTrue(moonCodePoint in 0x10000..0x10FFFF)
175+
176+
// See https://en.wikipedia.org/wiki/UTF-16#Code_points_from_U+010000_to_U+10FFFF for details
177+
val highSurrogate = (0xD800 + (moonCodePoint - 0x10000).ushr(10)).toChar()
178+
val lowSurrogate = (0xDC00 + (moonCodePoint - 0x10000).and(0x3FF)).toChar()
179+
180+
assertContentEquals(charArrayOf(highSurrogate, lowSurrogate), "🌚".toCharArray())
116181
}
117182

118183
@Test

0 commit comments

Comments
 (0)