@@ -319,29 +319,25 @@ impl<'a> Tokenizer<'a> {
             }
             // delimited (quoted) identifier
             quote_start if self.dialect.is_delimited_identifier_start(quote_start) => {
-                let mut s = String::new();
                 chars.next(); // consume the opening quote
                 let quote_end = SQLWord::matching_end_quote(quote_start);
-                while let Some(ch) = chars.next() {
-                    match ch {
-                        c if c == quote_end => break,
-                        _ => s.push(ch),
-                    }
+                let s = peeking_take_while(chars, |ch| ch != quote_end);
+                if chars.next() == Some(quote_end) {
+                    Ok(Some(Token::make_word(&s, Some(quote_start))))
+                } else {
+                    Err(TokenizerError(format!(
+                        "Expected close delimiter '{}' before EOF.",
+                        quote_end
+                    )))
                 }
-                Ok(Some(Token::make_word(&s, Some(quote_start))))
             }
             // numbers
             '0'..='9' => {
-                let mut s = String::new();
-                while let Some(&ch) = chars.peek() {
-                    match ch {
-                        '0'..='9' | '.' => {
-                            chars.next(); // consume
-                            s.push(ch);
-                        }
-                        _ => break,
-                    }
-                }
+                // TODO: https://jakewheat.github.io/sql-overview/sql-2011-foundation-grammar.html#unsigned-numeric-literal
+                let s = peeking_take_while(chars, |ch| match ch {
+                    '0'..='9' | '.' => true,
+                    _ => false,
+                });
                 Ok(Some(Token::Number(s)))
             }
             // punctuation
@@ -354,22 +350,12 @@ impl<'a> Tokenizer<'a> {
                 match chars.peek() {
                     Some('-') => {
                         chars.next(); // consume the second '-', starting a single-line comment
-                        let mut s = String::new();
-                        loop {
-                            match chars.next() {
-                                Some(ch) if ch != '\n' => {
-                                    s.push(ch);
-                                }
-                                other => {
-                                    if other.is_some() {
-                                        s.push('\n');
-                                    }
-                                    break Ok(Some(Token::Whitespace(
-                                        Whitespace::SingleLineComment(s),
-                                    )));
-                                }
-                            }
+                        let mut s = peeking_take_while(chars, |ch| ch != '\n');
+                        if let Some(ch) = chars.next() {
+                            assert_eq!(ch, '\n');
+                            s.push(ch);
                         }
+                        Ok(Some(Token::Whitespace(Whitespace::SingleLineComment(s))))
                     }
                     // a regular '-' operator
                     _ => Ok(Some(Token::Minus)),
@@ -394,14 +380,8 @@ impl<'a> Tokenizer<'a> {
             '!' => {
                 chars.next(); // consume
                 match chars.peek() {
-                    Some(&ch) => match ch {
-                        '=' => self.consume_and_return(chars, Token::Neq),
-                        _ => Err(TokenizerError(format!(
-                            "Tokenizer Error at Line: {}, Col: {}",
-                            self.line, self.col
-                        ))),
-                    },
-                    None => Err(TokenizerError(format!(
+                    Some('=') => self.consume_and_return(chars, Token::Neq),
+                    _ => Err(TokenizerError(format!(
                         "Tokenizer Error at Line: {}, Col: {}",
                         self.line, self.col
                     ))),
@@ -410,39 +390,27 @@ impl<'a> Tokenizer<'a> {
             '<' => {
                 chars.next(); // consume
                 match chars.peek() {
-                    Some(&ch) => match ch {
-                        '=' => self.consume_and_return(chars, Token::LtEq),
-                        '>' => self.consume_and_return(chars, Token::Neq),
-                        _ => Ok(Some(Token::Lt)),
-                    },
-                    None => Ok(Some(Token::Lt)),
+                    Some('=') => self.consume_and_return(chars, Token::LtEq),
+                    Some('>') => self.consume_and_return(chars, Token::Neq),
+                    _ => Ok(Some(Token::Lt)),
                 }
             }
             '>' => {
                 chars.next(); // consume
                 match chars.peek() {
-                    Some(&ch) => match ch {
-                        '=' => self.consume_and_return(chars, Token::GtEq),
-                        _ => Ok(Some(Token::Gt)),
-                    },
-                    None => Ok(Some(Token::Gt)),
+                    Some('=') => self.consume_and_return(chars, Token::GtEq),
+                    _ => Ok(Some(Token::Gt)),
                 }
             }
-            // colon
             ':' => {
                 chars.next();
                 match chars.peek() {
-                    Some(&ch) => match ch {
-                        // double colon
-                        ':' => self.consume_and_return(chars, Token::DoubleColon),
-                        _ => Ok(Some(Token::Colon)),
-                    },
-                    None => Ok(Some(Token::Colon)),
+                    Some(':') => self.consume_and_return(chars, Token::DoubleColon),
+                    _ => Ok(Some(Token::Colon)),
                 }
             }
             ';' => self.consume_and_return(chars, Token::SemiColon),
             '\\' => self.consume_and_return(chars, Token::Backslash),
-            // brakets
             '[' => self.consume_and_return(chars, Token::LBracket),
             ']' => self.consume_and_return(chars, Token::RBracket),
             '&' => self.consume_and_return(chars, Token::Ampersand),
@@ -456,16 +424,10 @@ impl<'a> Tokenizer<'a> {
 
     /// Tokenize an identifier or keyword, after the first char is already consumed.
     fn tokenize_word(&self, first_char: char, chars: &mut Peekable<Chars<'_>>) -> String {
-        let mut s = String::new();
-        s.push(first_char);
-        while let Some(&ch) = chars.peek() {
-            if self.dialect.is_identifier_part(ch) {
-                chars.next(); // consume
-                s.push(ch);
-            } else {
-                break;
-            }
-        }
+        let mut s = first_char.to_string();
+        s.push_str(&peeking_take_while(chars, |ch| {
+            self.dialect.is_identifier_part(ch)
+        }));
         s
     }
 
@@ -539,6 +501,25 @@ impl<'a> Tokenizer<'a> {
     }
 }
 
+/// Read from `chars` until `predicate` returns `false` or EOF is hit.
+/// Return the characters read as String, and keep the first non-matching
+/// char available as `chars.next()`.
+fn peeking_take_while(
+    chars: &mut Peekable<Chars<'_>>,
+    mut predicate: impl FnMut(char) -> bool,
+) -> String {
+    let mut s = String::new();
+    while let Some(&ch) = chars.peek() {
+        if predicate(ch) {
+            chars.next(); // consume
+            s.push(ch);
+        } else {
+            break;
+        }
+    }
+    s
+}
+
 #[cfg(test)]
 mod tests {
     use super::super::dialect::GenericSqlDialect;
@@ -768,6 +749,20 @@ mod tests {
         compare(expected, tokens);
     }
 
+    #[test]
+    fn tokenize_mismatched_quotes() {
+        let sql = String::from("\"foo");
+
+        let dialect = GenericSqlDialect {};
+        let mut tokenizer = Tokenizer::new(&dialect, &sql);
+        assert_eq!(
+            tokenizer.tokenize(),
+            Err(TokenizerError(
+                "Expected close delimiter '\"' before EOF.".to_string(),
+            ))
+        );
+    }
+
     #[test]
     fn tokenize_newlines() {
         let sql = String::from("line1\nline2\rline3\r\nline4\r");
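For readers following the refactor, here is a minimal, self-contained sketch of how the new `peeking_take_while` helper behaves. The helper body is reproduced verbatim from the hunk above; the `main` driver and its sample input are illustrative additions, not part of this commit:

use std::iter::Peekable;
use std::str::Chars;

// Reproduced from the diff above: consume chars while `predicate` holds,
// leaving the first non-matching char for the caller's next `chars.next()`.
fn peeking_take_while(
    chars: &mut Peekable<Chars<'_>>,
    mut predicate: impl FnMut(char) -> bool,
) -> String {
    let mut s = String::new();
    while let Some(&ch) = chars.peek() {
        if predicate(ch) {
            chars.next(); // consume
            s.push(ch);
        } else {
            break;
        }
    }
    s
}

fn main() {
    // Illustrative input (not from the commit): scan a numeric literal the
    // same way the '0'..='9' arm above does.
    let mut chars = "123.45,next".chars().peekable();
    let number = peeking_take_while(&mut chars, |ch| matches!(ch, '0'..='9' | '.'));
    assert_eq!(number, "123.45");
    // The delimiter was only peeked, never consumed, so it is still available:
    assert_eq!(chars.next(), Some(','));
}

Because the helper only ever advances past characters the predicate accepted, every call site in the diff can follow it with a plain `chars.next()` to inspect the delimiter, which is what makes the EOF check in the quoted-identifier arm possible.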