BUG: reading windows utf8 filenames in py3.6 (pandas-dev#25769)

vnlitvinov · anmyachev · commit 7f0cbc9577ba · 2019-04-18T14:58:00.000+03:00
* Fix gh-15086 properly instead of making a workaround
* Fix code style
* Make sure test_filename_with_special_chars properly tests combinations of chars; update whatsnew
* Address comments by @jreback
* Parametrize test_filename_with_special_chars: use CP-1252 and CP-1251 filenames separately, and skip the test on Windows with Python < 3.6, where it won't pass
diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
@@ -677,11 +677,7 @@ cdef class TextReader:
 
         if isinstance(source, basestring):
             if not isinstance(source, bytes):
-                if compat.PY36 and compat.is_platform_windows():
-                    # see gh-15086.
-                    encoding = "mbcs"
-                else:
-                    encoding = sys.getfilesystemencoding() or "utf-8"
+                encoding = sys.getfilesystemencoding() or "utf-8"
 
                 source = source.encode(encoding)
 
diff --git a/pandas/_libs/src/parser/io.c b/pandas/_libs/src/parser/io.c
@@ -17,6 +17,11 @@ The full license is in the LICENSE file, distributed with this software.
 #define O_BINARY 0
 #endif /* O_BINARY */
 
+#if PY_VERSION_HEX >= 0x03060000 && defined(_WIN32)
+#define USE_WIN_UTF16
+#include <Windows.h>
+#endif
+
 /*
   On-disk FILE, uncompressed
 */
@@ -27,7 +32,35 @@ void *new_file_source(char *fname, size_t buffer_size) {
         return NULL;
     }
 
+#ifdef USE_WIN_UTF16
+    // Fix gh-15086 properly - convert UTF8 to UTF16 that Windows widechar API
+    // accepts. This is needed because UTF8 might _not_ be convertible to MBCS
+    // for some conditions, as MBCS is locale-dependent, and not all unicode
+    // symbols can be expressed in it.
+    {
+        wchar_t* wname = NULL;
+        int required = MultiByteToWideChar(CP_UTF8, 0, fname, -1, NULL, 0);
+        if (required == 0) {
+            free(fs);
+            return NULL;
+        }
+        wname = (wchar_t*)malloc(required * sizeof(wchar_t));
+        if (wname == NULL) {
+            free(fs);
+            return NULL;
+        }
+        if (MultiByteToWideChar(CP_UTF8, 0, fname, -1, wname, required) <
+                                                                required) {
+            free(wname);
+            free(fs);
+            return NULL;
+        }
+        fs->fd = _wopen(wname, O_RDONLY | O_BINARY);
+        free(wname);
+    }
+#else
     fs->fd = open(fname, O_RDONLY | O_BINARY);
+#endif
     if (fs->fd == -1) {
         free(fs);
         return NULL;
diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py
@@ -1904,12 +1904,15 @@ def test_suppress_error_output(all_parsers, capsys):
     assert captured.err == ""
 
 
-def test_filename_with_special_chars(all_parsers):
+@pytest.mark.skipif(compat.is_platform_windows() and not compat.PY36,
+                    reason="On Python < 3.6 won't pass on Windows")
+@pytest.mark.parametrize("filename", ["sé-es-vé.csv", "ru-sй.csv"])
+def test_filename_with_special_chars(all_parsers, filename):
     # see gh-15086.
     parser = all_parsers
     df = DataFrame({"a": [1, 2, 3]})
 
-    with tm.ensure_clean("sé-es-vé.csv") as path:
+    with tm.ensure_clean(filename) as path:
         df.to_csv(path, index=False)
 
         result = parser.read_csv(path)