Skip to content

Commit 900f186

Browse files
Fix encoding detection and exception on empty files (#2195)
The encoding detection code was trying to catch encoding-related exceptions when the file was opened. This doesn't make sense: at that point no data has been read yet, so no encoding errors can be detected. Instead, catch encoding-related exceptions when the file contents are read. Also avoid bailing out with `Exception('Unknown encoding')` on empty files.
1 parent ad64452 commit 900f186

File tree

2 files changed

+32
-22
lines changed

2 files changed

+32
-22
lines changed

codespell_lib/_codespell.py

+19-22
Original file line numberDiff line numberDiff line change
@@ -200,30 +200,27 @@ def open_with_chardet(self, filename):
200200
return lines, encoding
201201

202202
def open_with_internal(self, filename):
203-
curr = 0
204-
while True:
205-
try:
206-
f = codecs.open(filename, 'r', encoding=encodings[curr])
207-
except UnicodeDecodeError:
208-
if not self.quiet_level & QuietLevels.ENCODING:
209-
print("WARNING: Decoding file using encoding=%s failed: %s"
210-
% (encodings[curr], filename,), file=sys.stderr)
211-
try:
212-
print("WARNING: Trying next encoding %s"
213-
% encodings[curr + 1], file=sys.stderr)
214-
except IndexError:
215-
pass
216-
217-
curr += 1
218-
else:
219-
lines = f.readlines()
220-
f.close()
221-
break
222-
if not lines:
203+
encoding = None
204+
first_try = True
205+
for encoding in encodings:
206+
if first_try:
207+
first_try = False
208+
elif not self.quiet_level & QuietLevels.ENCODING:
209+
print("WARNING: Trying next encoding %s"
210+
% encoding, file=sys.stderr)
211+
with codecs.open(filename, 'r', encoding=encoding) as f:
212+
try:
213+
lines = f.readlines()
214+
except UnicodeDecodeError:
215+
if not self.quiet_level & QuietLevels.ENCODING:
216+
print("WARNING: Decoding file using encoding=%s "
217+
"failed: %s" % (encoding, filename,),
218+
file=sys.stderr)
219+
else:
220+
break
221+
else:
223222
raise Exception('Unknown encoding')
224223

225-
encoding = encodings[curr]
226-
227224
return lines, encoding
228225

229226
# -.-:-.-:-.-:-.:-.-:-.-:-.-:-.-:-.:-.-:-.-:-.-:-.-:-.:-.-:-

codespell_lib/tests/test_basic.py

+13
Original file line numberDiff line numberDiff line change
@@ -272,6 +272,19 @@ def test_encoding(tmpdir, capsys):
272272
with open(f.name, 'ab') as f:
273273
f.write(u'naieve\n'.encode('utf-8'))
274274
assert cs.main(f.name) == 1
275+
# Encoding detection (only try ISO 8859-1 because UTF-8 is the default)
276+
with open(f.name, 'wb') as f:
277+
f.write(b'Speling error, non-ASCII: h\xe9t\xe9rog\xe9n\xe9it\xe9\n')
278+
# check warnings about wrong encoding are enabled with "-q 0"
279+
code, stdout, stderr = cs.main('-q', '0', f.name, std=True, count=True)
280+
assert code == 1
281+
assert 'Speling' in stdout
282+
assert 'iso-8859-1' in stderr
283+
# check warnings about wrong encoding are disabled with "-q 1"
284+
code, stdout, stderr = cs.main('-q', '1', f.name, std=True, count=True)
285+
assert code == 1
286+
assert 'Speling' in stdout
287+
assert 'iso-8859-1' not in stderr
275288
# Binary file warning
276289
with open(f.name, 'wb') as f:
277290
f.write(b'\x00\x00naiive\x00\x00')

0 commit comments

Comments
 (0)