Skip to content

Commit 2295061

Browse files
alessandroasm authored and BurntSushi committed
searcher: do UTF-8 BOM sniffing like UTF-16
Previously, we only looked for the UTF-16 BOM when determining whether to do transcoding. But we should look for the UTF-8 BOM as well. Fixes #1638, Closes #1697
1 parent 53c4855 commit 2295061

File tree

3 files changed

+34
-4
lines changed

3 files changed

+34
-4
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,8 @@ Bug fixes:
5454
Fix stdin detection when using PowerShell in UNIX environments.
5555
* [BUG #1765](https://github.com/BurntSushi/ripgrep/issues/1765):
5656
Fix panic when `--crlf` is used in some cases.
57+
* [BUG #1638](https://github.com/BurntSushi/ripgrep/issues/1638):
58+
Correctly sniff the UTF-8 BOM and do transcoding, like we do for UTF-16.
5759
* [BUG #1816](https://github.com/BurntSushi/ripgrep/issues/1816):
5860
Add documentation for glob alternate syntax, e.g., `{a,b,..}`.
5961
* [BUG #1847](https://github.com/BurntSushi/ripgrep/issues/1847):

crates/searcher/src/searcher/mod.rs

Lines changed: 23 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -788,7 +788,7 @@ impl Searcher {
788788
/// Returns true if and only if the given slice needs to be transcoded.
789789
fn slice_needs_transcoding(&self, slice: &[u8]) -> bool {
790790
self.config.encoding.is_some()
791-
|| (self.config.bom_sniffing && slice_has_utf16_bom(slice))
791+
|| (self.config.bom_sniffing && slice_has_bom(slice))
792792
}
793793
}
794794

@@ -973,16 +973,18 @@ impl Searcher {
973973
}
974974
}
975975

976-
/// Returns true if and only if the given slice begins with a UTF-16 BOM.
976+
/// Returns true if and only if the given slice begins with a UTF-8 or UTF-16
977+
/// BOM.
977978
///
978979
/// This is used by the searcher to determine if a transcoder is necessary.
979980
/// Otherwise, it is advantageous to search the slice directly.
980-
fn slice_has_utf16_bom(slice: &[u8]) -> bool {
981+
fn slice_has_bom(slice: &[u8]) -> bool {
981982
let enc = match encoding_rs::Encoding::for_bom(slice) {
982983
None => return false,
983984
Some((enc, _)) => enc,
984985
};
985-
[encoding_rs::UTF_16LE, encoding_rs::UTF_16BE].contains(&enc)
986+
[encoding_rs::UTF_16LE, encoding_rs::UTF_16BE, encoding_rs::UTF_8]
987+
.contains(&enc)
986988
}
987989

988990
#[cfg(test)]
@@ -1009,4 +1011,21 @@ mod tests {
10091011
let res = searcher.search_slice(matcher, &[], sink);
10101012
assert!(res.is_err());
10111013
}
1014+
1015+
#[test]
1016+
fn uft8_bom_sniffing() {
1017+
// See: https://github.com/BurntSushi/ripgrep/issues/1638
1018+
// ripgrep must sniff utf-8 BOM, just like it does with utf-16
1019+
let matcher = RegexMatcher::new("foo");
1020+
let haystack: &[u8] = &[0xef, 0xbb, 0xbf, 0x66, 0x6f, 0x6f];
1021+
1022+
let mut sink = KitchenSink::new();
1023+
let mut searcher = SearcherBuilder::new().build();
1024+
1025+
let res = searcher.search_slice(matcher, haystack, &mut sink);
1026+
assert!(res.is_ok());
1027+
1028+
let sink_output = String::from_utf8(sink.as_bytes().to_vec()).unwrap();
1029+
assert_eq!(sink_output, "1:0:foo\nbyte count:3\n");
1030+
}
10121031
}

tests/regression.rs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -867,6 +867,15 @@ use B;
867867
eqnice!("2\n", cmd.stdout());
868868
});
869869

870+
// See: https://github.com/BurntSushi/ripgrep/issues/1638
871+
//
872+
// Tests that when a UTF-8 BOM is sniffed, the column index is correct.
873+
rgtest!(r1638, |dir: Dir, mut cmd: TestCommand| {
874+
dir.create_bytes("foo", b"\xef\xbb\xbfx");
875+
876+
eqnice!("foo:1:1:x\n", cmd.arg("--column").arg("x").stdout());
877+
});
878+
870879
// See: https://github.com/BurntSushi/ripgrep/issues/1765
871880
rgtest!(r1765, |dir: Dir, mut cmd: TestCommand| {
872881
dir.create("test", "\n");

0 commit comments

Comments
 (0)