@@ -7,56 +7,72 @@ use syntax::{TextRange, TextSize};
7
7
8
8
#[ derive( Clone , Debug , PartialEq , Eq ) ]
9
9
pub struct LineIndex {
10
- /// Offset the the beginning of each line, zero-based
10
+ /// Offset of the beginning of each line, zero-based.
11
11
pub ( crate ) newlines : Vec < TextSize > ,
12
- /// List of non-ASCII characters on each line
13
- pub ( crate ) utf16_lines : NoHashHashMap < u32 , Vec < Utf16Char > > ,
12
+ /// List of non-ASCII characters on each line.
13
+ pub ( crate ) line_wide_chars : NoHashHashMap < u32 , Vec < WideChar > > ,
14
14
}
15
15
16
+ /// Line/Column information in native, utf8 format.
16
17
#[ derive( Clone , Copy , Debug , PartialEq , Eq , Hash ) ]
17
- pub struct LineColUtf16 {
18
+ pub struct LineCol {
18
19
/// Zero-based
19
20
pub line : u32 ,
20
- /// Zero-based
21
+ /// Zero-based utf8 offset
21
22
pub col : u32 ,
22
23
}
23
24
24
25
#[ derive( Clone , Copy , Debug , PartialEq , Eq , Hash ) ]
25
- pub struct LineCol {
26
+ pub enum WideEncoding {
27
+ Utf16 ,
28
+ Utf32 ,
29
+ }
30
+
31
+ /// Line/Column information in legacy encodings.
32
+ ///
33
+ /// Deliberately not a generic type and different from `LineCol`.
34
+ #[ derive( Clone , Copy , Debug , PartialEq , Eq , Hash ) ]
35
+ pub struct WideLineCol {
26
36
/// Zero-based
27
37
pub line : u32 ,
28
- /// Zero-based utf8 offset
38
+ /// Zero-based
29
39
pub col : u32 ,
30
40
}
31
41
32
42
#[ derive( Clone , Debug , Hash , PartialEq , Eq ) ]
33
- pub ( crate ) struct Utf16Char {
43
+ pub ( crate ) struct WideChar {
34
44
/// Start offset of a character inside a line, zero-based
35
45
pub ( crate ) start : TextSize ,
36
46
/// End offset of a character inside a line, zero-based
37
47
pub ( crate ) end : TextSize ,
38
48
}
39
49
40
- impl Utf16Char {
50
+ impl WideChar {
41
51
/// Returns the length in 8-bit UTF-8 code units.
42
52
fn len ( & self ) -> TextSize {
43
53
self . end - self . start
44
54
}
45
55
46
- /// Returns the length in 16-bit UTF-16 code units.
47
- fn len_utf16 ( & self ) -> usize {
48
- if self . len ( ) == TextSize :: from ( 4 ) {
49
- 2
50
- } else {
51
- 1
56
+ /// Returns the length in UTF-16 or UTF-32 code units.
57
+ fn wide_len ( & self , enc : WideEncoding ) -> usize {
58
+ match enc {
59
+ WideEncoding :: Utf16 => {
60
+ if self . len ( ) == TextSize :: from ( 4 ) {
61
+ 2
62
+ } else {
63
+ 1
64
+ }
65
+ }
66
+
67
+ WideEncoding :: Utf32 => 1 ,
52
68
}
53
69
}
54
70
}
55
71
56
72
impl LineIndex {
57
73
pub fn new ( text : & str ) -> LineIndex {
58
- let mut utf16_lines = NoHashHashMap :: default ( ) ;
59
- let mut utf16_chars = Vec :: new ( ) ;
74
+ let mut line_wide_chars = NoHashHashMap :: default ( ) ;
75
+ let mut wide_chars = Vec :: new ( ) ;
60
76
61
77
let mut newlines = Vec :: with_capacity ( 16 ) ;
62
78
newlines. push ( TextSize :: from ( 0 ) ) ;
@@ -71,8 +87,8 @@ impl LineIndex {
71
87
newlines. push ( curr_row) ;
72
88
73
89
// Save any wide (non-ASCII) characters seen in the previous line
74
- if !utf16_chars . is_empty ( ) {
75
- utf16_lines . insert ( line, mem:: take ( & mut utf16_chars ) ) ;
90
+ if !wide_chars . is_empty ( ) {
91
+ line_wide_chars . insert ( line, mem:: take ( & mut wide_chars ) ) ;
76
92
}
77
93
78
94
// Prepare for processing the next line
@@ -82,18 +98,18 @@ impl LineIndex {
82
98
}
83
99
84
100
if !c. is_ascii ( ) {
85
- utf16_chars . push ( Utf16Char { start : curr_col, end : curr_col + c_len } ) ;
101
+ wide_chars . push ( WideChar { start : curr_col, end : curr_col + c_len } ) ;
86
102
}
87
103
88
104
curr_col += c_len;
89
105
}
90
106
91
107
// Save any wide (non-ASCII) characters seen in the last line
92
- if !utf16_chars . is_empty ( ) {
93
- utf16_lines . insert ( line, utf16_chars ) ;
108
+ if !wide_chars . is_empty ( ) {
109
+ line_wide_chars . insert ( line, wide_chars ) ;
94
110
}
95
111
96
- LineIndex { newlines, utf16_lines }
112
+ LineIndex { newlines, line_wide_chars }
97
113
}
98
114
99
115
pub fn line_col ( & self , offset : TextSize ) -> LineCol {
@@ -109,13 +125,13 @@ impl LineIndex {
109
125
. map ( |offset| offset + TextSize :: from ( line_col. col ) )
110
126
}
111
127
112
- pub fn to_utf16 ( & self , line_col : LineCol ) -> LineColUtf16 {
113
- let col = self . utf8_to_utf16_col ( line_col. line , line_col. col . into ( ) ) ;
114
- LineColUtf16 { line : line_col. line , col : col as u32 }
128
+ pub fn to_wide ( & self , enc : WideEncoding , line_col : LineCol ) -> WideLineCol {
129
+ let col = self . utf8_to_wide_col ( enc , line_col. line , line_col. col . into ( ) ) ;
130
+ WideLineCol { line : line_col. line , col : col as u32 }
115
131
}
116
132
117
- pub fn to_utf8 ( & self , line_col : LineColUtf16 ) -> LineCol {
118
- let col = self . utf16_to_utf8_col ( line_col. line , line_col. col ) ;
133
+ pub fn to_utf8 ( & self , enc : WideEncoding , line_col : WideLineCol ) -> LineCol {
134
+ let col = self . wide_to_utf8_col ( enc , line_col. line , line_col. col ) ;
119
135
LineCol { line : line_col. line , col : col. into ( ) }
120
136
}
121
137
@@ -132,12 +148,12 @@ impl LineIndex {
132
148
. filter ( |it| !it. is_empty ( ) )
133
149
}
134
150
135
- fn utf8_to_utf16_col ( & self , line : u32 , col : TextSize ) -> usize {
151
+ fn utf8_to_wide_col ( & self , enc : WideEncoding , line : u32 , col : TextSize ) -> usize {
136
152
let mut res: usize = col. into ( ) ;
137
- if let Some ( utf16_chars ) = self . utf16_lines . get ( & line) {
138
- for c in utf16_chars {
153
+ if let Some ( wide_chars ) = self . line_wide_chars . get ( & line) {
154
+ for c in wide_chars {
139
155
if c. end <= col {
140
- res -= usize:: from ( c. len ( ) ) - c. len_utf16 ( ) ;
156
+ res -= usize:: from ( c. len ( ) ) - c. wide_len ( enc ) ;
141
157
} else {
142
158
// From here on, all wide (non-ASCII) characters come *after* the character we are mapping,
143
159
// so we don't need to take them into account
@@ -148,11 +164,11 @@ impl LineIndex {
148
164
res
149
165
}
150
166
151
- fn utf16_to_utf8_col ( & self , line : u32 , mut col : u32 ) -> TextSize {
152
- if let Some ( utf16_chars ) = self . utf16_lines . get ( & line) {
153
- for c in utf16_chars {
167
+ fn wide_to_utf8_col ( & self , enc : WideEncoding , line : u32 , mut col : u32 ) -> TextSize {
168
+ if let Some ( wide_chars ) = self . line_wide_chars . get ( & line) {
169
+ for c in wide_chars {
154
170
if col > u32:: from ( c. start ) {
155
- col += u32:: from ( c. len ( ) ) - c. len_utf16 ( ) as u32 ;
171
+ col += u32:: from ( c. len ( ) ) - c. wide_len ( enc ) as u32 ;
156
172
} else {
157
173
// From here on, all wide (non-ASCII) characters come *after* the character we are mapping,
158
174
// so we don't need to take them into account
@@ -167,6 +183,9 @@ impl LineIndex {
167
183
168
184
#[ cfg( test) ]
169
185
mod tests {
186
+ use test_utils:: skip_slow_tests;
187
+
188
+ use super :: WideEncoding :: { Utf16 , Utf32 } ;
170
189
use super :: * ;
171
190
172
191
#[ test]
@@ -210,67 +229,59 @@ mod tests {
210
229
const C: char = 'x';
211
230
" ,
212
231
) ;
213
- assert_eq ! ( col_index. utf16_lines . len( ) , 0 ) ;
232
+ assert_eq ! ( col_index. line_wide_chars . len( ) , 0 ) ;
214
233
}
215
234
216
235
#[ test]
217
- fn test_single_char ( ) {
218
- let col_index = LineIndex :: new (
219
- "
220
- const C: char = 'メ';
221
- " ,
222
- ) ;
223
-
224
- assert_eq ! ( col_index. utf16_lines. len( ) , 1 ) ;
225
- assert_eq ! ( col_index. utf16_lines[ & 1 ] . len( ) , 1 ) ;
226
- assert_eq ! ( col_index. utf16_lines[ & 1 ] [ 0 ] , Utf16Char { start: 17 . into( ) , end: 20 . into( ) } ) ;
227
-
228
- // UTF-8 to UTF-16, no changes
229
- assert_eq ! ( col_index. utf8_to_utf16_col( 1 , 15 . into( ) ) , 15 ) ;
230
-
231
- // UTF-8 to UTF-16
232
- assert_eq ! ( col_index. utf8_to_utf16_col( 1 , 22 . into( ) ) , 20 ) ;
233
-
234
- // UTF-16 to UTF-8, no changes
235
- assert_eq ! ( col_index. utf16_to_utf8_col( 1 , 15 ) , TextSize :: from( 15 ) ) ;
236
-
237
- // UTF-16 to UTF-8
238
- assert_eq ! ( col_index. utf16_to_utf8_col( 1 , 19 ) , TextSize :: from( 21 ) ) ;
239
-
240
- let col_index = LineIndex :: new ( "a𐐏b" ) ;
241
- assert_eq ! ( col_index. utf16_to_utf8_col( 0 , 3 ) , TextSize :: from( 5 ) ) ;
242
- }
243
-
244
- #[ test]
245
- fn test_string ( ) {
246
- let col_index = LineIndex :: new (
247
- "
248
- const C: char = \" メ メ\" ;
249
- " ,
250
- ) ;
251
-
252
- assert_eq ! ( col_index. utf16_lines. len( ) , 1 ) ;
253
- assert_eq ! ( col_index. utf16_lines[ & 1 ] . len( ) , 2 ) ;
254
- assert_eq ! ( col_index. utf16_lines[ & 1 ] [ 0 ] , Utf16Char { start: 17 . into( ) , end: 20 . into( ) } ) ;
255
- assert_eq ! ( col_index. utf16_lines[ & 1 ] [ 1 ] , Utf16Char { start: 21 . into( ) , end: 24 . into( ) } ) ;
256
-
257
- // UTF-8 to UTF-16
258
- assert_eq ! ( col_index. utf8_to_utf16_col( 1 , 15 . into( ) ) , 15 ) ;
259
-
260
- assert_eq ! ( col_index. utf8_to_utf16_col( 1 , 21 . into( ) ) , 19 ) ;
261
- assert_eq ! ( col_index. utf8_to_utf16_col( 1 , 25 . into( ) ) , 21 ) ;
262
-
263
- assert ! ( col_index. utf8_to_utf16_col( 2 , 15 . into( ) ) == 15 ) ;
264
-
265
- // UTF-16 to UTF-8
266
- assert_eq ! ( col_index. utf16_to_utf8_col( 1 , 15 ) , TextSize :: from( 15 ) ) ;
236
+ fn test_every_chars ( ) {
237
+ if skip_slow_tests ( ) {
238
+ return ;
239
+ }
267
240
268
- // メ UTF-8: 0xE3 0x83 0xA1, UTF-16: 0x30E1
269
- assert_eq ! ( col_index. utf16_to_utf8_col( 1 , 17 ) , TextSize :: from( 17 ) ) ; // first メ at 17..20
270
- assert_eq ! ( col_index. utf16_to_utf8_col( 1 , 18 ) , TextSize :: from( 20 ) ) ; // space
271
- assert_eq ! ( col_index. utf16_to_utf8_col( 1 , 19 ) , TextSize :: from( 21 ) ) ; // second メ at 21..24
241
+ let text: String = {
242
+ let mut chars: Vec < char > = ( ( 0 as char ) ..char:: MAX ) . collect ( ) ; // Neat!
243
+ chars. extend ( "\n " . repeat ( chars. len ( ) / 16 ) . chars ( ) ) ;
244
+ let mut rng = oorandom:: Rand32 :: new ( stdx:: rand:: seed ( ) ) ;
245
+ stdx:: rand:: shuffle ( & mut chars, |i| rng. rand_range ( 0 ..i as u32 ) as usize ) ;
246
+ chars. into_iter ( ) . collect ( )
247
+ } ;
248
+ assert ! ( text. contains( '💩' ) ) ; // Sanity check.
249
+
250
+ let line_index = LineIndex :: new ( & text) ;
251
+
252
+ let mut lin_col = LineCol { line : 0 , col : 0 } ;
253
+ let mut col_utf16 = 0 ;
254
+ let mut col_utf32 = 0 ;
255
+ for ( offset, c) in text. char_indices ( ) {
256
+ let got_offset = line_index. offset ( lin_col) . unwrap ( ) ;
257
+ assert_eq ! ( usize :: from( got_offset) , offset) ;
258
+
259
+ let got_lin_col = line_index. line_col ( got_offset) ;
260
+ assert_eq ! ( got_lin_col, lin_col) ;
261
+
262
+ for enc in [ Utf16 , Utf32 ] {
263
+ let wide_lin_col = line_index. to_wide ( enc, lin_col) ;
264
+ let got_lin_col = line_index. to_utf8 ( enc, wide_lin_col) ;
265
+ assert_eq ! ( got_lin_col, lin_col) ;
266
+
267
+ let want_col = match enc {
268
+ Utf16 => col_utf16,
269
+ Utf32 => col_utf32,
270
+ } ;
271
+ assert_eq ! ( wide_lin_col. col, want_col)
272
+ }
272
273
273
- assert_eq ! ( col_index. utf16_to_utf8_col( 2 , 15 ) , TextSize :: from( 15 ) ) ;
274
+ if c == '\n' {
275
+ lin_col. line += 1 ;
276
+ lin_col. col = 0 ;
277
+ col_utf16 = 0 ;
278
+ col_utf32 = 0 ;
279
+ } else {
280
+ lin_col. col += c. len_utf8 ( ) as u32 ;
281
+ col_utf16 += c. len_utf16 ( ) as u32 ;
282
+ col_utf32 += 1 ;
283
+ }
284
+ }
274
285
}
275
286
276
287
#[ test]
0 commit comments