Skip to content

Commit 8cf9c5d

Browse files
committed
Add writeUtf8Char
1 parent 65a22da commit 8cf9c5d

File tree

3 files changed

+114
-8
lines changed

3 files changed

+114
-8
lines changed

core/commonMain/src/kotlinx/io/text/TextInputObsolete.kt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -255,8 +255,8 @@ private val Utf8StateMachine = intArrayOf(
255255

256256
private const val STATE_FINISH = -2
257257
//private const val Utf8_STATE_ASCII = -1
258-
private const val STATE_UTF_8 = 0
259-
private const val STATE_REJECT = 1
258+
internal const val STATE_UTF_8 = 0
259+
internal const val STATE_REJECT = 1
260260

261261
private inline fun Input.decodeUtf8(consumer: (Int) -> Boolean) {
262262
val stateMachine = Utf8StateMachine

core/commonMain/src/kotlinx/io/text/TextOutputObsolete.kt

Lines changed: 57 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,9 @@ public fun Output.writeUtf8String(text: CharSequence, index: Int = 0, length: In
2727
continue
2828
}
2929

30-
if (textIndex == textEndIndex)
30+
if (textIndex == textEndIndex) {
3131
return@writeBuffer offset
32+
}
3233

3334
// get next character
3435
val character = text[textIndex++]
@@ -41,8 +42,8 @@ public fun Output.writeUtf8String(text: CharSequence, index: Int = 0, length: In
4142
// fetch next code
4243
val code = when {
4344
character.isHighSurrogate() -> {
44-
if (textIndex == textEndIndex - 1) {
45-
throw MalformedInputException("Splitted surrogate character")
45+
if (textIndex == textEndIndex) {
46+
splittedSurogate()
4647
}
4748
codePoint(character, text[textIndex++])
4849
}
@@ -51,7 +52,7 @@ public fun Output.writeUtf8String(text: CharSequence, index: Int = 0, length: In
5152

5253
// write Utf8 bytes to buffer or queue them for write in `bytes` if not enough space
5354
when {
54-
code < 0x7ff -> {
55+
code <= 0x7ff -> {
5556
buffer[offset++] = (0xc0 or ((code shr 6) and 0x1f)).toByte()
5657
val byte1 = (code and 0x3f) or 0x80
5758
if (offset < buffer.size) {
@@ -60,7 +61,7 @@ public fun Output.writeUtf8String(text: CharSequence, index: Int = 0, length: In
6061
bytes = byte1
6162
}
6263
}
63-
code < 0xffff -> {
64+
code <= 0xffff -> {
6465
buffer[offset++] = ((code shr 12) and 0x0f or 0xe0).toByte()
6566
val byte1 = ((code shr 6) and 0x3f) or 0x80
6667
val byte2 = (code and 0x3f) or 0x80
@@ -71,7 +72,7 @@ public fun Output.writeUtf8String(text: CharSequence, index: Int = 0, length: In
7172
bytes = (byte2 shl 8) or byte1 // order is reversed for writes
7273
}
7374
}
74-
code < 0x10ffff -> {
75+
code <= 0x10ffff -> {
7576
buffer[offset++] = ((code shr 18) and 0x07 or 0xf0).toByte()
7677
val byte1 = ((code shr 12) and 0x3f) or 0x80
7778
val byte2 = ((code shr 6) and 0x3f) or 0x80
@@ -93,6 +94,52 @@ public fun Output.writeUtf8String(text: CharSequence, index: Int = 0, length: In
9394
flush()
9495
}
9596

97+
/**
98+
* Write single UTF-8 [character] to [Output].
99+
*
100+
* @return count of written bytes
101+
* @throws MalformedInputException if [character] is splitted surrogate or invalid.
102+
*/
103+
public fun Output.writeUtf8Char(character: Char): Int {
104+
// ASCII character
105+
if (character <= lastASCII) {
106+
writeByte(character.toByte())
107+
return 1
108+
}
109+
110+
if (character.isHighSurrogate()) {
111+
throw MalformedInputException("Splitted surrogate character: $character")
112+
}
113+
114+
val code = character.toInt()
115+
116+
return when {
117+
code <= 0x7ff -> {
118+
val byte0 = (0xc0 or ((code shr 6) and 0x1f))
119+
val byte1 = (code and 0x3f) or 0x80
120+
writeShort(((byte0 shl 8) or byte1).toShort())
121+
2
122+
}
123+
code <= 0xffff -> {
124+
val byte0 = ((code shr 12) and 0x0f or 0xe0).toByte()
125+
val byte1 = ((code shr 6) and 0x3f) or 0x80
126+
val byte2 = (code and 0x3f) or 0x80
127+
writeByte(byte0)
128+
writeShort(((byte1 shl 8) or byte2).toShort())
129+
3
130+
}
131+
code <= 0x10ffff -> {
132+
val byte0 = ((code shr 18) and 0x07 or 0xf0)
133+
val byte1 = ((code shr 12) and 0x3f) or 0x80
134+
val byte2 = ((code shr 6) and 0x3f) or 0x80
135+
val byte3 = (code and 0x3f) or 0x80
136+
writeInt((byte0 shl 24) or (byte1 shl 16) or (byte2 shl 8) or byte3)
137+
4
138+
}
139+
else -> malformedCodePoint(code)
140+
}
141+
}
142+
96143
internal fun codePoint(high: Char, low: Char): Int {
97144
check(high.isHighSurrogate())
98145
check(low.isLowSurrogate())
@@ -107,3 +154,7 @@ private fun malformedCodePoint(codePoint: Int): Nothing {
107154
// TODO: revise exceptions
108155
throw MalformedInputException("Malformed Utf8 code point $codePoint")
109156
}
157+
158+
/**
 * Always throws [MalformedInputException] reporting a split surrogate character; never returns normally.
 */
internal fun splittedSurogate(): Nothing =
    throw MalformedInputException("Splitted surrogate character")

core/commonTest/src/kotlinx/io/text/OutputStringTest.kt

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,34 @@ open class OutputStringTest {
6969
assertEquals(expected.contentToString(), read.contentToString())
7070
}
7171

72+
/**
 * Writes a mixed ASCII / Cyrillic / Hangul / CJK / Latin-1 text one [Char] at a time with
 * `writeUtf8Char`, for every size in `bufferSizes`, and compares the bytes with a fixed UTF-8 encoding.
 */
@Test
fun testWriteUtf8Chars() = bufferSizes.forEach { size ->
    // Two spaces after "unicode": the expected bytes below contain 0x20, 0x20, 0x3a at that position,
    // so a single space would leave the text one byte shorter than `expected`.
    val text = "file content with unicode  : здороваться : 여보세요 : 你好 : ñç."
    // @formatter:off
    val expected = ubyteArrayOf(
        0x66u, 0x69u, 0x6cu, 0x65u, 0x20u, 0x63u, 0x6fu, 0x6eu, 0x74u, 0x65u, 0x6eu, 0x74u, 0x20u,
        0x77u, 0x69u, 0x74u, 0x68u, 0x20u, 0x75u, 0x6eu, 0x69u, 0x63u, 0x6fu, 0x64u, 0x65u, 0x20u, // ascii ends
        0x20u, 0x3au, 0x20u, 0xd0u, 0xb7u, 0xd0u, 0xb4u, 0xd0u, 0xbeu,
        0xd1u, 0x80u, 0xd0u, 0xbeu, 0xd0u, 0xb2u, 0xd0u, 0xb0u, 0xd1u, 0x82u, 0xd1u, 0x8cu, 0xd1u,
        0x81u, 0xd1u, 0x8fu, 0x20u, 0x3au, 0x20u, 0xecu, 0x97u, 0xacu, 0xebu, 0xb3u, 0xb4u, 0xecu,
        0x84u, 0xb8u, 0xecu, 0x9au, 0x94u, 0x20u, 0x3au, 0x20u, 0xe4u, 0xbdu, 0xa0u,
        0xe5u, 0xa5u, 0xbdu, 0x20u, 0x3au, 0x20u, 0xc3u, 0xb1u, 0xc3u, 0xa7u, 0x2eu
    )
    // @formatter:on

    val bytes = buildBytes(size) {
        text.forEach { writeUtf8Char(it) }
    }

    assertEquals(expected.size, bytes.size(), "Size $size")

    val input = bytes.input()
    val read = UByteArray(expected.size)
    input.readByteArray(read)
    assertTrue(input.exhausted(), "EOF")
    assertEquals(expected.contentToString(), read.contentToString())
}
99+
72100
@Test
73101
fun testWriteMultiByteAtEnd() {
74102
val input = buildBytes {
@@ -140,5 +168,32 @@ open class OutputStringTest {
140168
assertEquals("4444", input.readUtf8Line())
141169
assertTrue(input.exhausted(), "EOF")
142170
}
171+
172+
/**
 * Round-trips a single emoji (encoded as a surrogate pair in the string) through
 * `writeUtf8String` and `readUtf8LineTo`, expecting the same text back.
 */
@Test
fun testWriteSingleUnicode() {
    val text = """🤔"""
    val written = buildBytes { writeUtf8String(text, 0, text.length) }

    written.useInput {
        val decoded = buildString { readUtf8LineTo(this) }
        assertEquals(text, decoded)
    }
}
184+
185+
/**
 * Round-trips a single supplementary-plane glyph through `writeUtf8String` and
 * `readUtf8LineTo`, expecting the same text back.
 */
@Test
fun testParseGlyph() {
    val text = """􏿿"""
    val written = buildBytes { writeUtf8String(text, 0, text.length) }

    written.useInput {
        val decoded = buildString { readUtf8LineTo(this) }
        assertEquals(text, decoded)
    }
}
143198
}
144199

0 commit comments

Comments
 (0)