Skip to content

Commit a76ec96

Browse files
committed
Reimplement UTF-8-related functions using Unsafe API
1 parent cfb3bfa commit a76ec96

File tree

1 file changed

+78
-68
lines changed

1 file changed

+78
-68
lines changed

core/common/src/Utf8.kt

Lines changed: 78 additions & 68 deletions
Original file line number | Diff line number | Diff line change
@@ -70,6 +70,9 @@
7070
package kotlinx.io
7171

7272
import kotlinx.io.internal.*
73+
import kotlinx.io.unsafe.UnsafeBufferOperations
74+
import kotlinx.io.unsafe.withData
75+
import kotlin.math.min
7376

7477
/**
7578
* Returns the number of bytes used to encode the slice of `string` as UTF-8 when using [Sink.writeString].
@@ -457,6 +460,7 @@ private fun Buffer.commonReadUtf8CodePoint(): Int {
457460
}
458461
}
459462

463+
@OptIn(UnsafeIoApi::class)
460464
private inline fun Buffer.commonWriteUtf8(beginIndex: Int, endIndex: Int, charAt: (Int) -> Char) {
461465
// Transcode UTF-16 chars to UTF-8 bytes.
462466
var i = beginIndex
@@ -465,45 +469,49 @@ private inline fun Buffer.commonWriteUtf8(beginIndex: Int, endIndex: Int, charAt
465469

466470
when {
467471
c < 0x80 -> {
468-
val tail = writableSegment(1)
469-
val data = tail.data
470-
val segmentOffset = tail.limit - i
471-
val runLimit = minOf(endIndex, Segment.SIZE - segmentOffset)
472-
473-
// Emit a 7-bit character with 1 byte.
474-
data[segmentOffset + i++] = c.toByte() // 0xxxxxxx
475-
476-
// Fast-path contiguous runs of ASCII characters. This is ugly, but yields a ~4x performance
477-
// improvement over independent calls to writeByte().
478-
while (i < runLimit) {
479-
c = charAt(i).code
480-
if (c >= 0x80) break
481-
data[segmentOffset + i++] = c.toByte() // 0xxxxxxx
472+
UnsafeBufferOperations.writeToTail(this, 1) { ctx, segment ->
473+
val segmentOffset = -i
474+
val runLimit = minOf(endIndex, i + segment.remainingCapacity)
475+
476+
// Emit a 7-bit character with 1 byte.
477+
ctx.setUnchecked(segment, segmentOffset + i++, c.toByte()) // 0xxxxxxx
478+
479+
// Fast-path contiguous runs of ASCII characters. This is ugly, but yields a ~4x performance
480+
// improvement over independent calls to writeByte().
481+
while (i < runLimit) {
482+
c = charAt(i).code
483+
if (c >= 0x80) break
484+
ctx.setUnchecked(segment, segmentOffset + i++, c.toByte()) // 0xxxxxxx
485+
}
486+
487+
i + segmentOffset // Equivalent to i - (previous i).
482488
}
483-
484-
val runSize = i + segmentOffset - tail.limit // Equivalent to i - (previous i).
485-
tail.limit += runSize
486-
sizeMut += runSize.toLong()
487489
}
488490

489491
c < 0x800 -> {
490492
// Emit an 11-bit character with 2 bytes.
491-
val tail = writableSegment(2)
492-
tail.data[tail.limit] = (c shr 6 or 0xc0).toByte() // 110xxxxx
493-
tail.data[tail.limit + 1] = (c and 0x3f or 0x80).toByte() // 10xxxxxx
494-
tail.limit += 2
495-
sizeMut += 2L
493+
UnsafeBufferOperations.writeToTail(this, 2) { ctx, segment ->
494+
ctx.setUnchecked(
495+
segment, 0,
496+
(c shr 6 or 0xc0).toByte(), // 110xxxxx
497+
(c and 0x3f or 0x80).toByte() // 10xxxxxx
498+
)
499+
2
500+
}
496501
i++
497502
}
498503

499504
c < 0xd800 || c > 0xdfff -> {
500505
// Emit a 16-bit character with 3 bytes.
501-
val tail = writableSegment(3)
502-
tail.data[tail.limit] = (c shr 12 or 0xe0).toByte() // 1110xxxx
503-
tail.data[tail.limit + 1] = (c shr 6 and 0x3f or 0x80).toByte() // 10xxxxxx
504-
tail.data[tail.limit + 2] = (c and 0x3f or 0x80).toByte() // 10xxxxxx
505-
tail.limit += 3
506-
sizeMut += 3L
506+
UnsafeBufferOperations.writeToTail(this, 3) { ctx, segment ->
507+
ctx.setUnchecked(
508+
segment, 0,
509+
(c shr 12 or 0xe0).toByte(), // 1110xxxx
510+
(c shr 6 and 0x3f or 0x80).toByte(), // 10xxxxxx
511+
(c and 0x3f or 0x80).toByte() // 10xxxxxx
512+
)
513+
3
514+
}
507515
i++
508516
}
509517

@@ -522,20 +530,23 @@ private inline fun Buffer.commonWriteUtf8(beginIndex: Int, endIndex: Int, charAt
522530
val codePoint = 0x010000 + (c and 0x03ff shl 10 or (low and 0x03ff))
523531

524532
// Emit a 21-bit character with 4 bytes.
525-
val tail = writableSegment(4)
526-
tail.data[tail.limit] = (codePoint shr 18 or 0xf0).toByte() // 11110xxx
527-
tail.data[tail.limit + 1] = (codePoint shr 12 and 0x3f or 0x80).toByte() // 10xxxxxx
528-
tail.data[tail.limit + 2] = (codePoint shr 6 and 0x3f or 0x80).toByte() // 10xxyyyy
529-
tail.data[tail.limit + 3] = (codePoint and 0x3f or 0x80).toByte() // 10yyyyyy
530-
tail.limit += 4
531-
sizeMut += 4L
533+
UnsafeBufferOperations.writeToTail(this, 4) { ctx, segment ->
534+
ctx.setUnchecked(segment, 0,
535+
(codePoint shr 18 or 0xf0).toByte(), // 11110xxx
536+
(codePoint shr 12 and 0x3f or 0x80).toByte(), // 10xxxxxx
537+
(codePoint shr 6 and 0x3f or 0x80).toByte(), // 10xxyyyy
538+
(codePoint and 0x3f or 0x80).toByte() // 10yyyyyy
539+
)
540+
4
541+
}
532542
i += 2
533543
}
534544
}
535545
}
536546
}
537547
}
538548

549+
@OptIn(UnsafeIoApi::class)
539550
private fun Buffer.commonWriteUtf8CodePoint(codePoint: Int) {
540551
when {
541552
codePoint < 0 || codePoint > 0x10ffff -> {
@@ -551,11 +562,11 @@ private fun Buffer.commonWriteUtf8CodePoint(codePoint: Int) {
551562

552563
codePoint < 0x800 -> {
553564
// Emit an 11-bit code point with 2 bytes.
554-
val tail = writableSegment(2)
555-
tail.data[tail.limit] = (codePoint shr 6 or 0xc0).toByte() // 110xxxxx
556-
tail.data[tail.limit + 1] = (codePoint and 0x3f or 0x80).toByte() // 10xxxxxx
557-
tail.limit += 2
558-
sizeMut += 2L
565+
UnsafeBufferOperations.writeToTail(this, 2) { ctx, segment ->
566+
ctx.setUnchecked(segment, 0, (codePoint shr 6 or 0xc0).toByte()) // 110xxxxx
567+
ctx.setUnchecked(segment, 1, (codePoint and 0x3f or 0x80).toByte()) // 10xxxxxx
568+
2
569+
}
559570
}
560571

561572
codePoint in 0xd800..0xdfff -> {
@@ -565,48 +576,47 @@ private fun Buffer.commonWriteUtf8CodePoint(codePoint: Int) {
565576

566577
codePoint < 0x10000 -> {
567578
// Emit a 16-bit code point with 3 bytes.
568-
val tail = writableSegment(3)
569-
tail.data[tail.limit] = (codePoint shr 12 or 0xe0).toByte() // 1110xxxx
570-
tail.data[tail.limit + 1] = (codePoint shr 6 and 0x3f or 0x80).toByte() // 10xxxxxx
571-
tail.data[tail.limit + 2] = (codePoint and 0x3f or 0x80).toByte() // 10xxxxxx
572-
tail.limit += 3
573-
sizeMut += 3L
579+
UnsafeBufferOperations.writeToTail(this, 3) { ctx, segment ->
580+
ctx.setUnchecked(segment, 0, (codePoint shr 12 or 0xe0).toByte()) // 1110xxxx
581+
ctx.setUnchecked(segment, 1, (codePoint shr 6 and 0x3f or 0x80).toByte()) // 10xxxxxx
582+
ctx.setUnchecked(segment, 2, (codePoint and 0x3f or 0x80).toByte()) // 10xxxxxx
583+
3
584+
}
574585
}
575586

576587
else -> { // [0x10000, 0x10ffff]
577588
// Emit a 21-bit code point with 4 bytes.
578-
val tail = writableSegment(4)
579-
tail.data[tail.limit] = (codePoint shr 18 or 0xf0).toByte() // 11110xxx
580-
tail.data[tail.limit + 1] = (codePoint shr 12 and 0x3f or 0x80).toByte() // 10xxxxxx
581-
tail.data[tail.limit + 2] = (codePoint shr 6 and 0x3f or 0x80).toByte() // 10xxyyyy
582-
tail.data[tail.limit + 3] = (codePoint and 0x3f or 0x80).toByte() // 10yyyyyy
583-
tail.limit += 4
584-
sizeMut += 4L
589+
UnsafeBufferOperations.writeToTail(this, 4) { ctx, segment ->
590+
ctx.setUnchecked(segment,0, (codePoint shr 18 or 0xf0).toByte()) // 11110xxx
591+
ctx.setUnchecked(segment,1, (codePoint shr 12 and 0x3f or 0x80).toByte()) // 10xxxxxx
592+
ctx.setUnchecked(segment,2, (codePoint shr 6 and 0x3f or 0x80).toByte()) // 10xxyyyy
593+
ctx.setUnchecked(segment,3, (codePoint and 0x3f or 0x80).toByte()) // 10yyyyyy
594+
4
595+
}
585596
}
586597
}
587598
}
588599

600+
@OptIn(UnsafeIoApi::class)
589601
private fun Buffer.commonReadUtf8(byteCount: Long): String {
590602
require(byteCount >= 0 && byteCount <= Int.MAX_VALUE) {
591603
"byteCount ($byteCount) is not within the range [0..${Int.MAX_VALUE})"
592604
}
593605
require(byteCount)
594606
if (byteCount == 0L) return ""
595607

596-
val s = head!!
597-
if (s.pos + byteCount > s.limit) {
598-
// If the string spans multiple segments, delegate to readBytes().
599-
600-
return readByteArray(byteCount.toInt()).commonToUtf8String()
601-
}
602-
603-
val result = s.data.commonToUtf8String(s.pos, s.pos + byteCount.toInt())
604-
s.pos += byteCount.toInt()
605-
sizeMut -= byteCount
606-
607-
if (s.pos == s.limit) {
608-
recycleHead()
608+
UnsafeBufferOperations.iterate(this) { ctx, head ->
609+
head!!
610+
if (head.size >= byteCount) {
611+
var result = ""
612+
ctx.withData(head) { data, pos, limit ->
613+
result = data.commonToUtf8String(pos, min(limit, pos + byteCount.toInt()))
614+
skip(byteCount)
615+
return result
616+
}
617+
}
609618
}
610619

611-
return result
620+
// If the string spans multiple segments, delegate to readByteArray().
621+
return readByteArray(byteCount.toInt()).commonToUtf8String()
612622
}

0 commit comments

Comments
 (0)