BUG: reading windows utf8 filenames in py3.6 (pandas-dev#25769)

vnlitvinov · jreback · commit 6e979d86b33d · 2019-03-20T08:22:38.000-04:00
* Fix pandas-devgh-15086 properly instead of making a workaround * fix code style * Make sure test_filename_with_special_chars properly tests combinations of chars Updated whatsnew * Address comments by @jreback * Parametrize test_filename_with_special_chars Use CP-1252 and CP-1251 filenames separately, skip the test on Windows on < 3.6 as it won't pass
diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst
@@ -271,6 +271,7 @@ I/O
 - Bug in :func:`json_normalize` for ``errors='ignore'`` where missing values in the input data, were filled in resulting ``DataFrame`` with the string "nan" instead of ``numpy.nan`` (:issue:`25468`)
 - :meth:`DataFrame.to_html` now raises ``TypeError`` when using an invalid type for the ``classes`` parameter instead of ``AsseertionError`` (:issue:`25608`)
 - Bug in :meth:`DataFrame.to_string` and :meth:`DataFrame.to_latex` that would lead to incorrect output when the ``header`` keyword is used (:issue:`16718`)
+- Bug in :func:`read_csv` not properly interpreting the UTF8 encoded filenames on Windows on Python 3.6+ (:issue:`15086`)
 -
 
 
diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
@@ -678,11 +678,7 @@ cdef class TextReader:
 
         if isinstance(source, basestring):
             if not isinstance(source, bytes):
-                if compat.PY36 and compat.is_platform_windows():
-                    # see gh-15086.
-                    encoding = "mbcs"
-                else:
-                    encoding = sys.getfilesystemencoding() or "utf-8"
+                encoding = sys.getfilesystemencoding() or "utf-8"
 
                 source = source.encode(encoding)
 
diff --git a/pandas/_libs/src/parser/io.c b/pandas/_libs/src/parser/io.c
@@ -17,6 +17,11 @@ The full license is in the LICENSE file, distributed with this software.
 #define O_BINARY 0
 #endif  // O_BINARY
 
+#if PY_VERSION_HEX >= 0x03060000 && defined(_WIN32)
+#define USE_WIN_UTF16
+#include <Windows.h>
+#endif
+
 /*
   On-disk FILE, uncompressed
 */
@@ -27,7 +32,35 @@ void *new_file_source(char *fname, size_t buffer_size) {
         return NULL;
     }
 
+#ifdef USE_WIN_UTF16
+    // Fix gh-15086 properly - convert UTF8 to UTF16 that Windows widechar API
+    // accepts. This is needed because UTF8 might _not_ be convertible to MBCS
+    // for some conditions, as MBCS is locale-dependent, and not all unicode
+    // symbols can be expressed in it.
+    {
+        wchar_t* wname = NULL;
+        int required = MultiByteToWideChar(CP_UTF8, 0, fname, -1, NULL, 0);
+        if (required == 0) {
+            free(fs);
+            return NULL;
+        }
+        wname = (wchar_t*)malloc(required * sizeof(wchar_t));
+        if (wname == NULL) {
+            free(fs);
+            return NULL;
+        }
+        if (MultiByteToWideChar(CP_UTF8, 0, fname, -1, wname, required) <
+                                                                required) {
+            free(wname);
+            free(fs);
+            return NULL;
+        }
+        fs->fd = _wopen(wname, O_RDONLY | O_BINARY);
+        free(wname);
+    }
+#else
     fs->fd = open(fname, O_RDONLY | O_BINARY);
+#endif
     if (fs->fd == -1) {
         free(fs);
         return NULL;
diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py
@@ -1904,12 +1904,15 @@ def test_suppress_error_output(all_parsers, capsys):
     assert captured.err == ""
 
 
-def test_filename_with_special_chars(all_parsers):
+@pytest.mark.skipif(compat.is_platform_windows() and not compat.PY36,
+                    reason="On Python < 3.6 won't pass on Windows")
+@pytest.mark.parametrize("filename", ["sé-es-vé.csv", "ru-sй.csv"])
+def test_filename_with_special_chars(all_parsers, filename):
     # see gh-15086.
     parser = all_parsers
     df = DataFrame({"a": [1, 2, 3]})
 
-    with tm.ensure_clean("sé-es-vé.csv") as path:
+    with tm.ensure_clean(filename) as path:
         df.to_csv(path, index=False)
 
         result = parser.read_csv(path)

Original file line number	Diff line number	Diff line change
`@@ -271,6 +271,7 @@ I/O`
`271`	`271`	- Bug in :func:`json_normalize` for ``errors='ignore'`` where missing values in the input data, were filled in resulting ``DataFrame`` with the string "nan" instead of ``numpy.nan`` (:issue:`25468`)
`272`	`272`	- :meth:`DataFrame.to_html` now raises ``TypeError`` when using an invalid type for the ``classes`` parameter instead of ``AsseertionError`` (:issue:`25608`)
`273`	`273`	- Bug in :meth:`DataFrame.to_string` and :meth:`DataFrame.to_latex` that would lead to incorrect output when the ``header`` keyword is used (:issue:`16718`)
	`274`	+- Bug in :func:`read_csv` not properly interpreting the UTF8 encoded filenames on Windows on Python 3.6+ (:issue:`15086`)
`274`	`275`	`-`
`275`	`276`
`276`	`277`