diff --git a/.github/workflows/ubuntu.yml b/.github/workflows/ubuntu.yml index 301e7804ddbd8..9b3b3df9cbd0c 100644 --- a/.github/workflows/ubuntu.yml +++ b/.github/workflows/ubuntu.yml @@ -42,18 +42,26 @@ jobs: - name: "Minimum Versions" env_file: actions-38-minimum_versions.yaml pattern: "not slow and not network and not single_cpu" - - name: "Locale: it_IT.utf8" + - name: "Locale: it_IT" env_file: actions-38.yaml pattern: "not slow and not network and not single_cpu" extra_apt: "language-pack-it" + # Use the utf8 version as the default; it has no bad side effects. lang: "it_IT.utf8" lc_all: "it_IT.utf8" - - name: "Locale: zh_CN.utf8" + # Also install it_IT (its encoding is ISO8859-1) but do not activate it. + # It will be temporarily activated during tests with locale.setlocale(). + extra_loc: "it_IT" + - name: "Locale: zh_CN" env_file: actions-38.yaml pattern: "not slow and not network and not single_cpu" extra_apt: "language-pack-zh-hans" + # Use the utf8 version as the default; it has no bad side effects. lang: "zh_CN.utf8" lc_all: "zh_CN.utf8" + # Also install zh_CN (its encoding is gb2312) but do not activate it. 
+ # It will be temporarily activated during tests with locale.setlocale(). + extra_loc: "zh_CN" - name: "Copy-on-Write" env_file: actions-310.yaml pattern: "not slow and not network and not single_cpu" @@ -148,6 +156,12 @@ jobs: # xsel for clipboard tests run: sudo apt-get update && sudo apt-get install -y xsel ${{ env.EXTRA_APT }} + - name: Generate extra locales + # These extra locales will be available for locale.setlocale() calls in tests + run: | + sudo locale-gen ${{ matrix.extra_loc }} + if: ${{ matrix.extra_loc }} + - name: Set up Conda uses: ./.github/actions/setup-conda with: diff --git a/doc/source/whatsnew/v1.6.0.rst b/doc/source/whatsnew/v1.6.0.rst index c393b8a57f805..9d7e4c4dd5200 100644 --- a/doc/source/whatsnew/v1.6.0.rst +++ b/doc/source/whatsnew/v1.6.0.rst @@ -175,7 +175,7 @@ I/O Period ^^^^^^ -- +- Bug in :meth:`Period.strftime` and :meth:`PeriodIndex.strftime`, raising ``UnicodeDecodeError`` when a locale-specific directive was passed (:issue:`46319`) - Plotting diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index d2d4838bfafc0..25f6386001ef6 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -1162,7 +1162,8 @@ cdef str period_format(int64_t value, int freq, object fmt=None): return "NaT" if isinstance(fmt, str): - fmt = fmt.encode("utf-8") + # Encode using the current locale (not utf-8), in case fmt contains non-ASCII chars + fmt = util.string_encode_locale(fmt) if fmt is None: freq_group = get_freq_group(freq) @@ -1231,7 +1232,8 @@ cdef str _period_strftime(int64_t value, int freq, bytes fmt): # Execute c_strftime to process the usual datetime directives formatted = c_strftime(&dts, fmt) - result = util.char_to_string(formatted) + # Decode the c_strftime result according to the current locale + result = util.char_to_string_locale(formatted) free(formatted) # Now we will fill the placeholders corresponding to our additional directives diff --git a/pandas/_libs/tslibs/util.pxd b/pandas/_libs/tslibs/util.pxd index 
492b7d519551f..a28aace5d2f15 100644 --- a/pandas/_libs/tslibs/util.pxd +++ b/pandas/_libs/tslibs/util.pxd @@ -26,6 +26,10 @@ cdef extern from "Python.h": const char* PyUnicode_AsUTF8AndSize(object obj, Py_ssize_t* length) except NULL + object PyUnicode_EncodeLocale(object obj, const char *errors) nogil + object PyUnicode_DecodeLocale(const char *str, const char *errors) nogil + + from numpy cimport ( float64_t, int64_t, @@ -219,3 +223,13 @@ cdef inline const char* get_c_string_buf_and_size(str py_string, cdef inline const char* get_c_string(str py_string) except NULL: return get_c_string_buf_and_size(py_string, NULL) + + +cdef inline bytes string_encode_locale(str py_string): + """Encode py_string to bytes using the current system locale (unlike PyUnicode_Encode).""" + return PyUnicode_EncodeLocale(py_string, NULL) + + +cdef inline object char_to_string_locale(const char* data): + """Decode the C string data using the current system locale (unlike PyUnicode_FromString).""" + return PyUnicode_DecodeLocale(data, NULL) diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index 443aa743a6444..4f8ab78c13766 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -1,9 +1,14 @@ """ Test output formatting for Series/DataFrame, including to_string & reprs """ -from datetime import datetime +from contextlib import nullcontext +from datetime import ( + datetime, + time, +) from io import StringIO import itertools +import locale from operator import methodcaller import os from pathlib import Path @@ -46,6 +51,13 @@ use_32bit_repr = is_platform_windows() or not IS64 +def get_local_am_pm(): + """Return the (AM, PM) strings that strftime('%p') yields in the current locale.""" + am_local = time(1).strftime("%p") + pm_local = time(13).strftime("%p") + return am_local, pm_local + + @pytest.fixture(params=["string", "pathlike", "buffer"]) def filepath_or_buffer_id(request): """ @@ -3219,6 +3231,67 @@ def test_period_tz(self): per = 
dt.to_period(freq="H") assert per.format()[0] == "2013-01-01 00:00" + @pytest.mark.parametrize( + "locale_str", + [ + pytest.param(None, id=str(locale.getlocale())), + "it_IT.utf8", + "it_IT", # Note: encoding will be 'ISO8859-1' + "zh_CN.utf8", + "zh_CN", # Note: encoding will be 'gb2312' + ], + ) + def test_period_non_ascii_fmt(self, locale_str): + # GH#46468 non-ascii char in input format string leads to wrong output + + # Skip if locale cannot be set + if locale_str is not None and not tm.can_set_locale(locale_str, locale.LC_ALL): + pytest.skip(f"Skipping as locale '{locale_str}' cannot be set on host.") + + # Change locale temporarily for this test. + with tm.set_locale(locale_str, locale.LC_ALL) if locale_str else nullcontext(): + # Scalar Period + per = pd.Period("2018-03-11 13:00", freq="H") + assert per.strftime("%y é") == "18 é" + + # Index built with period_range + per = pd.period_range("2003-01-01 01:00:00", periods=2, freq="12h") + formatted = per.format(date_format="%y é") + assert formatted[0] == "03 é" + assert formatted[1] == "03 é" + + @pytest.mark.parametrize( + "locale_str", + [ + pytest.param(None, id=str(locale.getlocale())), + "it_IT.utf8", + "it_IT", # Note: encoding will be 'ISO8859-1' + "zh_CN.utf8", + "zh_CN", # Note: encoding will be 'gb2312' + ], + ) + def test_period_custom_locale_directive(self, locale_str): + # GH#46319 locale-specific directive leads to non-utf8 c strftime char* result + + # Skip if locale cannot be set + if locale_str is not None and not tm.can_set_locale(locale_str, locale.LC_ALL): + pytest.skip(f"Skipping as locale '{locale_str}' cannot be set on host.") + + # Change locale temporarily for this test. 
+ with tm.set_locale(locale_str, locale.LC_ALL) if locale_str else nullcontext(): + # Get the locale-specific AM/PM reference strings + am_local, pm_local = get_local_am_pm() + + # Scalar Period + per = pd.Period("2018-03-11 13:00", freq="H") + assert per.strftime("%p") == pm_local + + # Index built with period_range + per = pd.period_range("2003-01-01 01:00:00", periods=2, freq="12h") + formatted = per.format(date_format="%y %I:%M:%S%p") + assert formatted[0] == f"03 01:00:00{am_local}" + assert formatted[1] == f"03 01:00:00{pm_local}" + class TestDatetimeIndexFormat: def test_datetime(self):