Skip to content

Commit 7f0cbc9

Browse files
vnlitvinov authored and anmyachev committed
BUG: reading windows utf8 filenames in py3.6 (pandas-dev#25769)
* Fix pandas-dev gh-15086 properly instead of making a workaround
* Fix code style
* Make sure test_filename_with_special_chars properly tests combinations of chars
  Updated whatsnew
* Address comments by @jreback
* Parametrize test_filename_with_special_chars
  Use CP-1252 and CP-1251 filenames separately; skip the test on Windows on Python < 3.6, as it won't pass there
1 parent cb00deb commit 7f0cbc9

File tree

3 files changed

+39
-7
lines changed

3 files changed

+39
-7
lines changed

pandas/_libs/parsers.pyx

+1 -5
Original file line numberDiff line numberDiff line change
@@ -677,11 +677,7 @@ cdef class TextReader:
677677

678678
if isinstance(source, basestring):
679679
if not isinstance(source, bytes):
680-
if compat.PY36 and compat.is_platform_windows():
681-
# see gh-15086.
682-
encoding = "mbcs"
683-
else:
684-
encoding = sys.getfilesystemencoding() or "utf-8"
680+
encoding = sys.getfilesystemencoding() or "utf-8"
685681

686682
source = source.encode(encoding)
687683

pandas/_libs/src/parser/io.c

+33
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,11 @@ The full license is in the LICENSE file, distributed with this software.
1717
#define O_BINARY 0
1818
#endif /* O_BINARY */
1919

20+
#if PY_VERSION_HEX >= 0x03060000 && defined(_WIN32)
21+
#define USE_WIN_UTF16
22+
#include <Windows.h>
23+
#endif
24+
2025
/*
2126
On-disk FILE, uncompressed
2227
*/
@@ -27,7 +32,35 @@ void *new_file_source(char *fname, size_t buffer_size) {
2732
return NULL;
2833
}
2934

35+
#ifdef USE_WIN_UTF16
36+
// Fix gh-15086 properly - convert UTF8 to UTF16 that Windows widechar API
37+
// accepts. This is needed because UTF8 might _not_ be convertible to MBCS
38+
// for some conditions, as MBCS is locale-dependent, and not all unicode
39+
// symbols can be expressed in it.
40+
{
41+
wchar_t* wname = NULL;
42+
int required = MultiByteToWideChar(CP_UTF8, 0, fname, -1, NULL, 0);
43+
if (required == 0) {
44+
free(fs);
45+
return NULL;
46+
}
47+
wname = (wchar_t*)malloc(required * sizeof(wchar_t));
48+
if (wname == NULL) {
49+
free(fs);
50+
return NULL;
51+
}
52+
if (MultiByteToWideChar(CP_UTF8, 0, fname, -1, wname, required) <
53+
required) {
54+
free(wname);
55+
free(fs);
56+
return NULL;
57+
}
58+
fs->fd = _wopen(wname, O_RDONLY | O_BINARY);
59+
free(wname);
60+
}
61+
#else
3062
fs->fd = open(fname, O_RDONLY | O_BINARY);
63+
#endif
3164
if (fs->fd == -1) {
3265
free(fs);
3366
return NULL;

pandas/tests/io/parser/test_common.py

+5 -2
Original file line numberDiff line numberDiff line change
@@ -1904,12 +1904,15 @@ def test_suppress_error_output(all_parsers, capsys):
19041904
assert captured.err == ""
19051905

19061906

1907-
def test_filename_with_special_chars(all_parsers):
1907+
@pytest.mark.skipif(compat.is_platform_windows() and not compat.PY36,
1908+
reason="On Python < 3.6 won't pass on Windows")
1909+
@pytest.mark.parametrize("filename", ["sé-es-vé.csv", "ru-sй.csv"])
1910+
def test_filename_with_special_chars(all_parsers, filename):
19081911
# see gh-15086.
19091912
parser = all_parsers
19101913
df = DataFrame({"a": [1, 2, 3]})
19111914

1912-
with tm.ensure_clean("sé-es-vé.csv") as path:
1915+
with tm.ensure_clean(filename) as path:
19131916
df.to_csv(path, index=False)
19141917

19151918
result = parser.read_csv(path)

0 commit comments

Comments
 (0)