Skip to content

Commit eb68816

Browse files
committed
Make functions to read and write code points public
Fixes #307
1 parent a04ef0d commit eb68816

File tree

6 files changed

+118
-42
lines changed

6 files changed

+118
-42
lines changed

core/api/kotlinx-io-core.api

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -187,12 +187,14 @@ public final class kotlinx/io/SourcesKt {
187187
}
188188

189189
public final class kotlinx/io/Utf8Kt {
190+
public static final fun readCodePointValue (Lkotlinx/io/Source;)I
190191
public static final fun readLine (Lkotlinx/io/Source;)Ljava/lang/String;
191192
public static final fun readLineStrict (Lkotlinx/io/Source;J)Ljava/lang/String;
192193
public static synthetic fun readLineStrict$default (Lkotlinx/io/Source;JILjava/lang/Object;)Ljava/lang/String;
193194
public static final fun readString (Lkotlinx/io/Buffer;)Ljava/lang/String;
194195
public static final fun readString (Lkotlinx/io/Source;)Ljava/lang/String;
195196
public static final fun readString (Lkotlinx/io/Source;J)Ljava/lang/String;
197+
public static final fun writeCodePointValue (Lkotlinx/io/Sink;I)V
196198
public static final fun writeString (Lkotlinx/io/Sink;Ljava/lang/String;II)V
197199
public static synthetic fun writeString$default (Lkotlinx/io/Sink;Ljava/lang/String;IIILjava/lang/Object;)V
198200
}

core/api/kotlinx-io-core.klib.api

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@ final fun (kotlinx.io/Buffer).kotlinx.io/snapshot(): kotlinx.io.bytestring/ByteS
8282
final fun (kotlinx.io/RawSink).kotlinx.io/buffered(): kotlinx.io/Sink // kotlinx.io/buffered|[email protected](){}[0]
8383
final fun (kotlinx.io/RawSource).kotlinx.io/buffered(): kotlinx.io/Source // kotlinx.io/buffered|[email protected](){}[0]
8484
final fun (kotlinx.io/Sink).kotlinx.io/write(kotlinx.io.bytestring/ByteString, kotlin/Int =..., kotlin/Int =...) // kotlinx.io/write|[email protected](kotlinx.io.bytestring.ByteString;kotlin.Int;kotlin.Int){}[0]
85+
final fun (kotlinx.io/Sink).kotlinx.io/writeCodePointValue(kotlin/Int) // kotlinx.io/writeCodePointValue|[email protected](kotlin.Int){}[0]
8586
final fun (kotlinx.io/Sink).kotlinx.io/writeDecimalLong(kotlin/Long) // kotlinx.io/writeDecimalLong|[email protected](kotlin.Long){}[0]
8687
final fun (kotlinx.io/Sink).kotlinx.io/writeDouble(kotlin/Double) // kotlinx.io/writeDouble|[email protected](kotlin.Double){}[0]
8788
final fun (kotlinx.io/Sink).kotlinx.io/writeDoubleLe(kotlin/Double) // kotlinx.io/writeDoubleLe|[email protected](kotlin.Double){}[0]
@@ -105,6 +106,7 @@ final fun (kotlinx.io/Source).kotlinx.io/readByteArray(): kotlin/ByteArray // ko
105106
final fun (kotlinx.io/Source).kotlinx.io/readByteArray(kotlin/Int): kotlin/ByteArray // kotlinx.io/readByteArray|[email protected](kotlin.Int){}[0]
106107
final fun (kotlinx.io/Source).kotlinx.io/readByteString(): kotlinx.io.bytestring/ByteString // kotlinx.io/readByteString|[email protected](){}[0]
107108
final fun (kotlinx.io/Source).kotlinx.io/readByteString(kotlin/Int): kotlinx.io.bytestring/ByteString // kotlinx.io/readByteString|[email protected](kotlin.Int){}[0]
109+
final fun (kotlinx.io/Source).kotlinx.io/readCodePointValue(): kotlin/Int // kotlinx.io/readCodePointValue|[email protected](){}[0]
108110
final fun (kotlinx.io/Source).kotlinx.io/readDecimalLong(): kotlin/Long // kotlinx.io/readDecimalLong|[email protected](){}[0]
109111
final fun (kotlinx.io/Source).kotlinx.io/readDouble(): kotlin/Double // kotlinx.io/readDouble|[email protected](){}[0]
110112
final fun (kotlinx.io/Source).kotlinx.io/readDoubleLe(): kotlin/Double // kotlinx.io/readDoubleLe|[email protected](){}[0]

core/common/src/Utf8.kt

Lines changed: 20 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -122,14 +122,23 @@ internal fun String.utf8Size(startIndex: Int = 0, endIndex: Int = length): Long
122122
/**
123123
* Encodes [codePoint] in UTF-8 and writes it to this sink.
124124
*
125+
* Note that, in general, a value retrieved from [Char.code] cannot be written directly
126+
* as it may be a part of a [surrogate pair](https://www.unicode.org/faq/utf_bom.html#utf16-2) (that could be
127+
* detected using [Char.isSurrogate], or [Char.isHighSurrogate] and [Char.isLowSurrogate]).
128+
* Such a pair of characters needs to be manually converted back to a single code point
129+
* which can then be written to a [Sink].
130+
* Without such a conversion, data written to a [Sink] can no
131+
* longer be converted back to the string from which the surrogate pair was retrieved.
132+
*
125133
* @param codePoint the codePoint to be written.
126134
*
127135
* @throws IllegalStateException when the sink is closed.
128136
*
129-
* @sample kotlinx.io.samples.KotlinxIoCoreCommonSamples.utf8CodePointSample
137+
* @sample kotlinx.io.samples.KotlinxIoCoreCommonSamples.writeUtf8CodePointSample
138+
* @sample kotlinx.io.samples.KotlinxIoCoreCommonSamples.writeSurrogatePair
130139
*/
131140
@OptIn(DelicateIoApi::class)
132-
internal fun Sink.writeUtf8CodePoint(codePoint: Int): Unit =
141+
public fun Sink.writeCodePointValue(codePoint: Int): Unit =
133142
writeToInternalBuffer { it.commonWriteUtf8CodePoint(codePoint) }
134143

135144
/**
@@ -196,24 +205,29 @@ public fun Source.readString(byteCount: Long): String {
196205
}
197206

198207
/**
199-
* Removes and returns a single UTF-8 code point, reading between 1 and 4 bytes as necessary.
208+
* Decodes a single code point value from UTF-8 code units, reading between 1 and 4 bytes as necessary.
200209
*
201210
* If this source is exhausted before a complete code point can be read, this throws an
202211
* [EOFException] and consumes no input.
203212
*
204-
* If this source doesn't start with a properly-encoded UTF-8 code point, this method will remove
213+
* If this source doesn't start with a properly encoded UTF-8 code point, this method will remove
205214
* 1 or more non-UTF-8 bytes and return the replacement character (`U+fffd`). This covers encoding
206-
* problems (the input is not properly-encoded UTF-8), characters out of range (beyond the
215+
* problems (the input is not properly encoded UTF-8), characters out of range (beyond the
207216
* `0x10ffff` limit of Unicode), code points for UTF-16 surrogates (`U+d800`..`U+dfff`) and overlong
208217
* encodings (such as `0xc080` for the NUL character in modified UTF-8).
209218
*
219+
* Note that, in general, the returned value cannot be directly converted to [Char] as it may be outside
220+
* of [Char]'s value range, in which case it should be manually converted to a
221+
* [surrogate pair](https://www.unicode.org/faq/utf_bom.html#utf16-2).
222+
*
210223
* @throws EOFException when the source is exhausted before a complete code point can be read.
211224
* @throws IllegalStateException when the source is closed.
212225
*
213226
* @sample kotlinx.io.samples.KotlinxIoCoreCommonSamples.readUtf8CodePointSample
227+
* @sample kotlinx.io.samples.KotlinxIoCoreCommonSamples.surrogatePairs
214228
*/
215229
@OptIn(InternalIoApi::class)
216-
internal fun Source.readUtf8CodePoint(): Int {
230+
public fun Source.readCodePointValue(): Int {
217231
require(1)
218232

219233
val b0 = buffer[0].toInt()
@@ -226,13 +240,6 @@ internal fun Source.readUtf8CodePoint(): Int {
226240
return buffer.commonReadUtf8CodePoint()
227241
}
228242

229-
/**
230-
* @see Source.readUtf8CodePoint
231-
*/
232-
internal fun Buffer.readUtf8CodePoint(): Int {
233-
return this.commonReadUtf8CodePoint()
234-
}
235-
236243
/**
237244
* Removes and returns UTF-8 encoded characters up to but not including the next line break. A line break is
238245
* either `"\n"` or `"\r\n"`; these characters are not included in the result.

core/common/test/AbstractSourceTest.kt

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1099,25 +1099,25 @@ abstract class AbstractBufferedSourceTest internal constructor(
10991099
with(sink) {
11001100
writeByte(0x7f)
11011101
emit()
1102-
assertEquals(0x7f, source.readUtf8CodePoint().toLong())
1102+
assertEquals(0x7f, source.readCodePointValue().toLong())
11031103

11041104
writeByte(0xdf.toByte())
11051105
writeByte(0xbf.toByte())
11061106
emit()
1107-
assertEquals(0x07ff, source.readUtf8CodePoint().toLong())
1107+
assertEquals(0x07ff, source.readCodePointValue().toLong())
11081108

11091109
writeByte(0xef.toByte())
11101110
writeByte(0xbf.toByte())
11111111
writeByte(0xbf.toByte())
11121112
emit()
1113-
assertEquals(0xffff, source.readUtf8CodePoint().toLong())
1113+
assertEquals(0xffff, source.readCodePointValue().toLong())
11141114

11151115
writeByte(0xf4.toByte())
11161116
writeByte(0x8f.toByte())
11171117
writeByte(0xbf.toByte())
11181118
writeByte(0xbf.toByte())
11191119
emit()
1120-
assertEquals(0x10ffff, source.readUtf8CodePoint().toLong())
1120+
assertEquals(0x10ffff, source.readCodePointValue().toLong())
11211121
}
11221122
}
11231123

@@ -1126,20 +1126,20 @@ abstract class AbstractBufferedSourceTest internal constructor(
11261126
with(sink) {
11271127
writeByte(0xdf.toByte()) // a second byte is missing
11281128
emit()
1129-
assertFailsWith<EOFException> { source.readUtf8CodePoint() }
1129+
assertFailsWith<EOFException> { source.readCodePointValue() }
11301130
assertEquals(1, source.readByteArray().size)
11311131

11321132
writeByte(0xe2.toByte())
11331133
writeByte(0x98.toByte()) // a third byte is missing
11341134
emit()
1135-
assertFailsWith<EOFException> { source.readUtf8CodePoint() }
1135+
assertFailsWith<EOFException> { source.readCodePointValue() }
11361136
assertEquals(2, source.readByteArray().size)
11371137

11381138
writeByte(0xf0.toByte())
11391139
writeByte(0x9f.toByte())
11401140
writeByte(0x92.toByte()) // a fourth byte is missing
11411141
emit()
1142-
assertFailsWith<EOFException> { source.readUtf8CodePoint() }
1142+
assertFailsWith<EOFException> { source.readCodePointValue() }
11431143
assertEquals(3, source.readByteArray().size)
11441144
}
11451145
}

core/common/test/Utf8Test.kt

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -228,22 +228,22 @@ class Utf8Test {
228228
@Test
229229
fun readCodePointFromEmptyBufferThrowsEofException() {
230230
val buffer = Buffer()
231-
assertFailsWith<EOFException> { buffer.readUtf8CodePoint() }
231+
assertFailsWith<EOFException> { buffer.readCodePointValue() }
232232
}
233233

234234
@Test
235235
fun readLeadingContinuationByteReturnsReplacementCharacter() {
236236
val buffer = Buffer()
237237
buffer.writeByte(0xbf.toByte())
238-
assertEquals(REPLACEMENT_CODE_POINT, buffer.readUtf8CodePoint())
238+
assertEquals(REPLACEMENT_CODE_POINT, buffer.readCodePointValue())
239239
assertTrue(buffer.exhausted())
240240
}
241241

242242
@Test
243243
fun readMissingContinuationBytesThrowsEofException() {
244244
val buffer = Buffer()
245245
buffer.writeByte(0xdf.toByte())
246-
assertFailsWith<EOFException> { buffer.readUtf8CodePoint() }
246+
assertFailsWith<EOFException> { buffer.readCodePointValue() }
247247
assertFalse(buffer.exhausted()) // Prefix byte wasn't consumed.
248248
}
249249

@@ -252,11 +252,11 @@ class Utf8Test {
252252
// 5-byte and 6-byte code points are not supported.
253253
val buffer = Buffer()
254254
buffer.write("f888808080".decodeHex())
255-
assertEquals(REPLACEMENT_CODE_POINT, buffer.readUtf8CodePoint())
256-
assertEquals(REPLACEMENT_CODE_POINT, buffer.readUtf8CodePoint())
257-
assertEquals(REPLACEMENT_CODE_POINT, buffer.readUtf8CodePoint())
258-
assertEquals(REPLACEMENT_CODE_POINT, buffer.readUtf8CodePoint())
259-
assertEquals(REPLACEMENT_CODE_POINT, buffer.readUtf8CodePoint())
255+
assertEquals(REPLACEMENT_CODE_POINT, buffer.readCodePointValue())
256+
assertEquals(REPLACEMENT_CODE_POINT, buffer.readCodePointValue())
257+
assertEquals(REPLACEMENT_CODE_POINT, buffer.readCodePointValue())
258+
assertEquals(REPLACEMENT_CODE_POINT, buffer.readCodePointValue())
259+
assertEquals(REPLACEMENT_CODE_POINT, buffer.readCodePointValue())
260260
assertTrue(buffer.exhausted())
261261
}
262262

@@ -265,8 +265,8 @@ class Utf8Test {
265265
// Use a non-continuation byte where a continuation byte is expected.
266266
val buffer = Buffer()
267267
buffer.write("df20".decodeHex())
268-
assertEquals(REPLACEMENT_CODE_POINT, buffer.readUtf8CodePoint())
269-
assertEquals(0x20, buffer.readUtf8CodePoint()) // Non-continuation character not consumed.
268+
assertEquals(REPLACEMENT_CODE_POINT, buffer.readCodePointValue())
269+
assertEquals(0x20, buffer.readCodePointValue()) // Non-continuation character not consumed.
270270
assertTrue(buffer.exhausted())
271271
}
272272

@@ -275,18 +275,18 @@ class Utf8Test {
275275
// A 4-byte encoding with data above the U+10ffff Unicode maximum.
276276
val buffer = Buffer()
277277
buffer.write("f4908080".decodeHex())
278-
assertEquals(REPLACEMENT_CODE_POINT, buffer.readUtf8CodePoint())
278+
assertEquals(REPLACEMENT_CODE_POINT, buffer.readCodePointValue())
279279
assertTrue(buffer.exhausted())
280280
}
281281

282282
@Test
283283
fun readSurrogateCodePoint() {
284284
val buffer = Buffer()
285285
buffer.write("eda080".decodeHex())
286-
assertEquals(REPLACEMENT_CODE_POINT, buffer.readUtf8CodePoint())
286+
assertEquals(REPLACEMENT_CODE_POINT, buffer.readCodePointValue())
287287
assertTrue(buffer.exhausted())
288288
buffer.write("edbfbf".decodeHex())
289-
assertEquals(REPLACEMENT_CODE_POINT, buffer.readUtf8CodePoint())
289+
assertEquals(REPLACEMENT_CODE_POINT, buffer.readCodePointValue())
290290
assertTrue(buffer.exhausted())
291291
}
292292

@@ -295,15 +295,15 @@ class Utf8Test {
295295
// Use 2 bytes to encode data that only needs 1 byte.
296296
val buffer = Buffer()
297297
buffer.write("c080".decodeHex())
298-
assertEquals(REPLACEMENT_CODE_POINT, buffer.readUtf8CodePoint())
298+
assertEquals(REPLACEMENT_CODE_POINT, buffer.readCodePointValue())
299299
assertTrue(buffer.exhausted())
300300
}
301301

302302
@Test
303303
fun writeCodePointBeyondUnicodeMaximum() {
304304
val buffer = Buffer()
305305
assertFailsWith<IllegalArgumentException>("Unexpected code point: 0x110000") {
306-
buffer.writeUtf8CodePoint(0x110000)
306+
buffer.writeCodePointValue(0x110000)
307307
}
308308
}
309309

@@ -322,13 +322,13 @@ class Utf8Test {
322322
}
323323

324324
private fun Buffer.assertCodePointEncoded(expectedHex: String, codePoint: Int) {
325-
writeUtf8CodePoint(codePoint)
325+
writeCodePointValue(codePoint)
326326
assertArrayEquals(expectedHex.decodeHex(), readByteArray())
327327
}
328328

329329
private fun Buffer.assertCodePointDecoded(expectedCodePoint: Int, hex: String) {
330330
write(hex.decodeHex())
331-
assertEquals(expectedCodePoint, readUtf8CodePoint())
331+
assertEquals(expectedCodePoint, readCodePointValue())
332332
}
333333

334334
private fun Buffer.assertUtf8StringEncoded(expectedHex: String, string: String) {
@@ -351,7 +351,7 @@ class Utf8Test {
351351
val bufferUtf8 = Buffer()
352352
for (charIdx in string.indices) {
353353
val c = string[charIdx]
354-
bufferUtf8.writeUtf8CodePoint(c.code)
354+
bufferUtf8.writeCodePointValue(c.code)
355355
}
356356
assertArrayEquals(expectedUtf8, bufferUtf8.readByteArray())
357357

core/common/test/samples/samples.kt

Lines changed: 69 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -100,19 +100,84 @@ class KotlinxIoCoreCommonSamples {
100100
fun writeUtf8CodePointSample() {
101101
val buffer = Buffer()
102102

103-
buffer.writeInt('Δ'.code) // writes integer value as is
104-
assertContentEquals(byteArrayOf(0, 0, 0x3, 0x94.toByte()), buffer.readByteArray())
103+
// Basic Latin (a.k.a. ASCII) characters are encoded with a single byte
104+
buffer.writeCodePointValue('Y'.code)
105+
assertContentEquals(byteArrayOf(0x59), buffer.readByteArray())
105106

106-
buffer.writeUtf8CodePoint('Δ'.code) // encodes code point using UTF-8 encoding
107+
// wider characters are encoded into multiple UTF-8 code units
108+
buffer.writeCodePointValue('Δ'.code)
107109
assertContentEquals(byteArrayOf(0xce.toByte(), 0x94.toByte()), buffer.readByteArray())
110+
111+
// note the difference: unlike writeCodePointValue, writeInt writes the integer value as is, without UTF-8 encoding
112+
buffer.writeInt('Δ'.code)
113+
assertContentEquals(byteArrayOf(0, 0, 0x3, 0x94.toByte()), buffer.readByteArray())
114+
}
115+
116+
@Test
117+
fun writeSurrogatePair() {
118+
val buffer = Buffer()
119+
120+
// U+1F31E (a.k.a. "sun with face") is too wide to fit in a single UTF-16 character,
121+
// so it's represented using a surrogate pair.
122+
val chars = "🌞".toCharArray()
123+
assertEquals(2, chars.size)
124+
125+
// such a pair has to be manually converted to a single code point
126+
assertTrue(chars[0].isHighSurrogate())
127+
assertTrue(chars[1].isLowSurrogate())
128+
129+
val highSurrogate = chars[0].code
130+
val lowSurrogate = chars[1].code
131+
132+
// see https://en.wikipedia.org/wiki/UTF-16#Code_points_from_U+010000_to_U+10FFFF for details
133+
val codePoint = 0x10000 + (highSurrogate - 0xD800).shl(10).or(lowSurrogate - 0xDC00)
134+
assertEquals(0x1F31E, codePoint)
135+
136+
// now we can write the code point
137+
buffer.writeCodePointValue(codePoint)
138+
// and read the correct string back
139+
assertEquals("🌞", buffer.readString())
140+
141+
// we won't achieve that by writing the surrogates as they are
142+
buffer.apply {
143+
writeCodePointValue(highSurrogate)
144+
writeCodePointValue(lowSurrogate)
145+
}
146+
assertNotEquals("🌞", buffer.readString())
108147
}
109148

110149
@Test
111150
fun readUtf8CodePointSample() {
112151
val buffer = Buffer()
113152

114153
buffer.writeUShort(0xce94U)
115-
assertEquals(0x394, buffer.readUtf8CodePoint()) // decodes single UTF-8 encoded code point
154+
assertEquals(0x394, buffer.readCodePointValue()) // decodes a single UTF-8 encoded code point
155+
}
156+
157+
@Test
158+
fun surrogatePairs() {
159+
val buffer = Buffer()
160+
161+
// that's a U+1F31A, a.k.a. "new moon with face"
162+
buffer.writeString("🌚")
163+
// it should be encoded with 4 code units
164+
assertEquals(4, buffer.size)
165+
166+
// let's read it back as a single code point
167+
val moonCodePoint = buffer.readCodePointValue()
168+
// all code units were consumed
169+
assertEquals(0, buffer.size)
170+
171+
// the moon is too wide to fit in a single UTF-16 character!
172+
assertNotEquals(moonCodePoint, moonCodePoint.toChar().code)
173+
// "too wide" means in the [U+010000, U+10FFFF] range
174+
assertTrue(moonCodePoint in 0x10000..0x10FFFF)
175+
176+
// See https://en.wikipedia.org/wiki/UTF-16#Code_points_from_U+010000_to_U+10FFFF for details
177+
val highSurrogate = (0xD800 + (moonCodePoint - 0x10000).ushr(10)).toChar()
178+
val lowSurrogate = (0xDC00 + (moonCodePoint - 0x10000).and(0x3FF)).toChar()
179+
180+
assertContentEquals(charArrayOf(highSurrogate, lowSurrogate), "🌚".toCharArray())
116181
}
117182

118183
@Test

0 commit comments

Comments
 (0)