Skip to content

Commit 6e979d8

Browse files
vnlitvinov authored and jreback committed
BUG: reading windows utf8 filenames in py3.6 (pandas-dev#25769)
* Fix gh-15086 properly instead of making a workaround
* Fix code style
* Make sure test_filename_with_special_chars properly tests combinations of chars. Updated whatsnew.
* Address comments by @jreback
* Parametrize test_filename_with_special_chars: use CP-1252 and CP-1251 filenames separately, and skip the test on Windows on Python < 3.6, as it won't pass there
1 parent 4663951 commit 6e979d8

File tree

4 files changed

+40
-7
lines changed

4 files changed

+40
-7
lines changed

doc/source/whatsnew/v0.25.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -271,6 +271,7 @@ I/O
271271
- Bug in :func:`json_normalize` for ``errors='ignore'`` where missing values in the input data were filled in the resulting ``DataFrame`` with the string "nan" instead of ``numpy.nan`` (:issue:`25468`)
272272
- :meth:`DataFrame.to_html` now raises ``TypeError`` when using an invalid type for the ``classes`` parameter instead of ``AssertionError`` (:issue:`25608`)
273273
- Bug in :meth:`DataFrame.to_string` and :meth:`DataFrame.to_latex` that would lead to incorrect output when the ``header`` keyword is used (:issue:`16718`)
274+
- Bug in :func:`read_csv` not properly interpreting UTF-8 encoded filenames on Windows on Python 3.6+ (:issue:`15086`)
274275
-
275276

276277

pandas/_libs/parsers.pyx

+1-5
Original file line numberDiff line numberDiff line change
@@ -678,11 +678,7 @@ cdef class TextReader:
678678

679679
if isinstance(source, basestring):
680680
if not isinstance(source, bytes):
681-
if compat.PY36 and compat.is_platform_windows():
682-
# see gh-15086.
683-
encoding = "mbcs"
684-
else:
685-
encoding = sys.getfilesystemencoding() or "utf-8"
681+
encoding = sys.getfilesystemencoding() or "utf-8"
686682

687683
source = source.encode(encoding)
688684

pandas/_libs/src/parser/io.c

+33
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,11 @@ The full license is in the LICENSE file, distributed with this software.
1717
#define O_BINARY 0
1818
#endif // O_BINARY
1919

20+
#if PY_VERSION_HEX >= 0x03060000 && defined(_WIN32)
21+
#define USE_WIN_UTF16
22+
#include <Windows.h>
23+
#endif
24+
2025
/*
2126
On-disk FILE, uncompressed
2227
*/
@@ -27,7 +32,35 @@ void *new_file_source(char *fname, size_t buffer_size) {
2732
return NULL;
2833
}
2934

35+
#ifdef USE_WIN_UTF16
36+
// Fix gh-15086 properly - convert UTF8 to UTF16 that Windows widechar API
37+
// accepts. This is needed because UTF8 might _not_ be convertible to MBCS
38+
// for some conditions, as MBCS is locale-dependent, and not all unicode
39+
// symbols can be expressed in it.
40+
{
41+
wchar_t* wname = NULL;
42+
int required = MultiByteToWideChar(CP_UTF8, 0, fname, -1, NULL, 0);
43+
if (required == 0) {
44+
free(fs);
45+
return NULL;
46+
}
47+
wname = (wchar_t*)malloc(required * sizeof(wchar_t));
48+
if (wname == NULL) {
49+
free(fs);
50+
return NULL;
51+
}
52+
if (MultiByteToWideChar(CP_UTF8, 0, fname, -1, wname, required) <
53+
required) {
54+
free(wname);
55+
free(fs);
56+
return NULL;
57+
}
58+
fs->fd = _wopen(wname, O_RDONLY | O_BINARY);
59+
free(wname);
60+
}
61+
#else
3062
fs->fd = open(fname, O_RDONLY | O_BINARY);
63+
#endif
3164
if (fs->fd == -1) {
3265
free(fs);
3366
return NULL;

pandas/tests/io/parser/test_common.py

+5-2
Original file line numberDiff line numberDiff line change
@@ -1904,12 +1904,15 @@ def test_suppress_error_output(all_parsers, capsys):
19041904
assert captured.err == ""
19051905

19061906

1907-
def test_filename_with_special_chars(all_parsers):
1907+
@pytest.mark.skipif(compat.is_platform_windows() and not compat.PY36,
1908+
reason="On Python < 3.6 won't pass on Windows")
1909+
@pytest.mark.parametrize("filename", ["sé-es-vé.csv", "ru-sй.csv"])
1910+
def test_filename_with_special_chars(all_parsers, filename):
19081911
# see gh-15086.
19091912
parser = all_parsers
19101913
df = DataFrame({"a": [1, 2, 3]})
19111914

1912-
with tm.ensure_clean("sé-es-vé.csv") as path:
1915+
with tm.ensure_clean(filename) as path:
19131916
df.to_csv(path, index=False)
19141917

19151918
result = parser.read_csv(path)

0 commit comments

Comments
 (0)