Skip to content

Commit 6c92c7d

Browse files
committed
internal/export/unicode: add table generator
This is used to update tables in core unicode.

Change-Id: I6fb34eba45842e38426b1ca54e79b74c361195ec
Reviewed-on: https://go-review.googlesource.com/c/154439
Run-TryBot: Marcel van Lohuizen <[email protected]>
Reviewed-by: Brad Fitzpatrick <[email protected]>
1 parent 7319793 commit 6c92c7d

File tree

3 files changed

+56
-141
lines changed

3 files changed

+56
-141
lines changed

gen.go

+2-1
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,8 @@ pkg unicode, var <new script or property> *RangeTable
9797
var unicode = &dependency{}
9898
if updateCore {
9999
fmt.Printf("Updating core to version %s...\n", gen.UnicodeVersion())
100-
unicode = generate("unicode")
100+
unicodeInternal := generate("./internal/export/unicode")
101+
unicode = generate("unicode", unicodeInternal)
101102

102103
// Test some users of the unicode packages, especially the ones that
103104
// keep a mirrored table. These may need to be corrected by hand.

internal/export/unicode/doc.go

+13
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
// Copyright 2018 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
// Package unicode generates the Unicode tables in core.
6+
package unicode
7+
8+
// This package is defined here, instead of core, as Go does not allow any
9+
// standard packages to have non-standard imports, even if imported in files
10+
// with a build ignore tag.
11+
12+
//go:generate go run gen.go -tables=all -output tables.go
13+
//go:generate mv tables.go $GOROOT/src/unicode

internal/export/unicode/maketables.go renamed to internal/export/unicode/gen.go

+41-140
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@ import (
2424
"strconv"
2525
"strings"
2626
"unicode"
27+
28+
"golang.org/x/text/unicode/rangetable"
2729
)
2830

2931
func main() {
@@ -450,9 +452,7 @@ const progHeader = `// Copyright 2013 The Go Authors. All rights reserved.
450452
// Use of this source code is governed by a BSD-style
451453
// license that can be found in the LICENSE file.
452454
453-
// Code generated by maketables; DO NOT EDIT.
454-
// To regenerate, run:
455-
// maketables --tables=%s --data=%s --casefolding=%s
455+
// Code generated by go generate; DO NOT EDIT.
456456
457457
package unicode
458458
@@ -504,7 +504,7 @@ func printCategories() {
504504
fullCategoryTest(list)
505505
return
506506
}
507-
printf(progHeader, *tablelist, *dataURL, *casefoldingURL)
507+
printf(progHeader)
508508

509509
println("// Version is the Unicode edition from which the tables are derived.")
510510
printf("const Version = %q\n\n", version())
@@ -596,91 +596,38 @@ func printCategories() {
596596

597597
type Op func(code rune) bool
598598

599-
const format = "\t\t{0x%04x, 0x%04x, %d},\n"
600-
601599
func dumpRange(header string, inCategory Op) {
602-
print(header)
603-
next := rune(0)
604-
latinOffset := 0
605-
print("\tR16: []Range16{\n")
606-
// one Range for each iteration
607-
count := &range16Count
608-
size := 16
609-
for {
610-
// look for start of range
611-
for next < rune(len(chars)) && !inCategory(next) {
612-
next++
613-
}
614-
if next >= rune(len(chars)) {
615-
// no characters remain
616-
break
617-
}
618-
619-
// start of range
620-
lo := next
621-
hi := next
622-
stride := rune(1)
623-
// accept lo
624-
next++
625-
// look for another character to set the stride
626-
for next < rune(len(chars)) && !inCategory(next) {
627-
next++
628-
}
629-
if next >= rune(len(chars)) {
630-
// no more characters
631-
printf(format, lo, hi, stride)
632-
break
633-
}
634-
// set stride
635-
stride = next - lo
636-
// check for length of run. next points to first jump in stride
637-
for i := next; i < rune(len(chars)); i++ {
638-
if inCategory(i) == (((i - lo) % stride) == 0) {
639-
// accept
640-
if inCategory(i) {
641-
hi = i
642-
}
643-
} else {
644-
// no more characters in this run
645-
break
646-
}
647-
}
648-
if uint32(hi) <= unicode.MaxLatin1 {
649-
latinOffset++
600+
runes := []rune{}
601+
for i := range chars {
602+
r := rune(i)
603+
if inCategory(r) {
604+
runes = append(runes, r)
650605
}
651-
size, count = printRange(uint32(lo), uint32(hi), uint32(stride), size, count)
652-
// next range: start looking where this range ends
653-
next = hi + 1
654-
}
655-
print("\t},\n")
656-
if latinOffset > 0 {
657-
printf("\tLatinOffset: %d,\n", latinOffset)
658606
}
659-
print("}\n\n")
607+
printRangeTable(header, runes)
660608
}
661609

662-
func printRange(lo, hi, stride uint32, size int, count *int) (int, *int) {
663-
if size == 16 && hi >= 1<<16 {
664-
if lo < 1<<16 {
665-
if lo+stride != hi {
666-
logger.Fatalf("unexpected straddle: %U %U %d", lo, hi, stride)
667-
}
668-
// No range contains U+FFFF as an instance, so split
669-
// the range into two entries. That way we can maintain
670-
// the invariant that R32 contains only >= 1<<16.
671-
printf(format, lo, lo, 1)
672-
lo = hi
673-
stride = 1
674-
*count++
610+
func printRangeTable(header string, runes []rune) {
611+
rt := rangetable.New(runes...)
612+
print(header)
613+
println("\tR16: []Range16{")
614+
for _, r := range rt.R16 {
615+
printf("\t\t{%#04x, %#04x, %d},\n", r.Lo, r.Hi, r.Stride)
616+
range16Count++
617+
}
618+
println("\t},")
619+
if len(rt.R32) > 0 {
620+
println("\tR32: []Range32{")
621+
for _, r := range rt.R32 {
622+
printf("\t\t{%#x, %#x, %d},\n", r.Lo, r.Hi, r.Stride)
623+
range32Count++
675624
}
676-
print("\t},\n")
677-
print("\tR32: []Range32{\n")
678-
size = 32
679-
count = &range32Count
680-
}
681-
printf(format, lo, hi, stride)
682-
*count++
683-
return size, count
625+
println("\t},")
626+
}
627+
if rt.LatinOffset > 0 {
628+
printf("\tLatinOffset: %d,\n", rt.LatinOffset)
629+
}
630+
printf("}\n\n")
684631
}
685632

686633
func fullCategoryTest(list []string) {
@@ -751,26 +698,6 @@ func parseScript(line string, scripts map[string][]Script) {
751698
scripts[name] = append(scripts[name], Script{uint32(lo), uint32(hi), name})
752699
}
753700

754-
// The script tables have a lot of adjacent elements. Fold them together.
755-
func foldAdjacent(r []Script) []unicode.Range32 {
756-
s := make([]unicode.Range32, 0, len(r))
757-
j := 0
758-
for i := 0; i < len(r); i++ {
759-
if j > 0 && r[i].lo == s[j-1].Hi+1 {
760-
s[j-1].Hi = r[i].hi
761-
} else {
762-
s = s[0 : j+1]
763-
s[j] = unicode.Range32{
764-
Lo: uint32(r[i].lo),
765-
Hi: uint32(r[i].hi),
766-
Stride: 1,
767-
}
768-
j++
769-
}
770-
}
771-
return s
772-
}
773-
774701
func fullScriptTest(list []string, installed map[string]*unicode.RangeTable, scripts map[string][]Script) {
775702
for _, name := range list {
776703
if _, ok := scripts[name]; !ok {
@@ -796,13 +723,11 @@ var deprecatedAliases = map[string]string{
796723

797724
// PropList.txt has the same format as Scripts.txt so we can share its parser.
798725
func printScriptOrProperty(doProps bool) {
799-
flag := "scripts"
800726
flaglist := *scriptlist
801727
file := "Scripts.txt"
802728
table := scripts
803729
installed := unicode.Scripts
804730
if doProps {
805-
flag = "props"
806731
flaglist = *proplist
807732
file = "PropList.txt"
808733
table = props
@@ -831,13 +756,6 @@ func printScriptOrProperty(doProps bool) {
831756
return
832757
}
833758

834-
printf(
835-
"// Generated by running\n"+
836-
"// maketables --%s=%s --url=%s\n"+
837-
"// DO NOT EDIT\n\n",
838-
flag,
839-
flaglist,
840-
*url)
841759
if flaglist == "all" {
842760
if doProps {
843761
println("// Properties is the set of Unicode property tables.")
@@ -874,19 +792,14 @@ func printScriptOrProperty(doProps bool) {
874792
alias, name)
875793
ndecl++
876794
}
877-
printf("var _%s = &RangeTable {\n", name)
878-
ranges := foldAdjacent(table[name])
879-
print("\tR16: []Range16{\n")
880-
size := 16
881-
count := &range16Count
882-
for _, s := range ranges {
883-
size, count = printRange(s.Lo, s.Hi, s.Stride, size, count)
884-
}
885-
print("\t},\n")
886-
if off := findLatinOffset(ranges); off > 0 {
887-
printf("\tLatinOffset: %d,\n", off)
795+
decl := fmt.Sprintf("var _%s = &RangeTable {\n", name)
796+
runes := []rune{}
797+
for _, scr := range table[name] {
798+
for r := scr.lo; r <= scr.hi; r++ {
799+
runes = append(runes, rune(r))
800+
}
888801
}
889-
print("}\n\n")
802+
printRangeTable(decl, runes)
890803
}
891804
decl.Sort()
892805
println("// These variables have type *RangeTable.")
@@ -897,14 +810,6 @@ func printScriptOrProperty(doProps bool) {
897810
print(")\n\n")
898811
}
899812

900-
func findLatinOffset(ranges []unicode.Range32) int {
901-
i := 0
902-
for i < len(ranges) && ranges[i].Hi <= unicode.MaxLatin1 {
903-
i++
904-
}
905-
return i
906-
}
907-
908813
const (
909814
CaseUpper = 1 << iota
910815
CaseLower
@@ -1054,14 +959,10 @@ func printCases() {
1054959
return
1055960
}
1056961
printf(
1057-
"// Generated by running\n"+
1058-
"// maketables --data=%s --casefolding=%s\n"+
1059-
"// DO NOT EDIT\n\n"+
1060-
"// CaseRanges is the table describing case mappings for all letters with\n"+
1061-
"// non-self mappings.\n"+
1062-
"var CaseRanges = _CaseRanges\n"+
1063-
"var _CaseRanges = []CaseRange {\n",
1064-
*dataURL, *casefoldingURL)
962+
"// CaseRanges is the table describing case mappings for all letters with\n" +
963+
"// non-self mappings.\n" +
964+
"var CaseRanges = _CaseRanges\n" +
965+
"var _CaseRanges = []CaseRange {\n")
1065966

1066967
var startState *caseState // the start of a run; nil for not active
1067968
var prevState = &caseState{} // the state of the previous character

0 commit comments

Comments
 (0)