Merge branch 'main' into add-mask-fillna

sanggon6107 · sanggon6107 · commit 6fd89863e90a · 2025-01-25T17:01:28.000+09:00
diff --git a/ci/code_checks.sh b/ci/code_checks.sh
@@ -146,15 +146,13 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         -i "pandas.tseries.offsets.CustomBusinessMonthBegin PR02" \
         -i "pandas.tseries.offsets.CustomBusinessMonthBegin.calendar GL08" \
         -i "pandas.tseries.offsets.CustomBusinessMonthBegin.holidays GL08" \
-        -i "pandas.tseries.offsets.CustomBusinessMonthBegin.is_on_offset SA01" \
         -i "pandas.tseries.offsets.CustomBusinessMonthBegin.m_offset GL08" \
         -i "pandas.tseries.offsets.CustomBusinessMonthBegin.n GL08" \
         -i "pandas.tseries.offsets.CustomBusinessMonthBegin.normalize GL08" \
         -i "pandas.tseries.offsets.CustomBusinessMonthBegin.weekmask GL08" \
         -i "pandas.tseries.offsets.CustomBusinessMonthEnd PR02" \
         -i "pandas.tseries.offsets.CustomBusinessMonthEnd.calendar GL08" \
         -i "pandas.tseries.offsets.CustomBusinessMonthEnd.holidays GL08" \
-        -i "pandas.tseries.offsets.CustomBusinessMonthEnd.is_on_offset SA01" \
         -i "pandas.tseries.offsets.CustomBusinessMonthEnd.m_offset GL08" \
         -i "pandas.tseries.offsets.CustomBusinessMonthEnd.n GL08" \
         -i "pandas.tseries.offsets.CustomBusinessMonthEnd.normalize GL08" \
@@ -191,7 +189,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         -i "pandas.tseries.offsets.Hour.is_on_offset GL08" \
         -i "pandas.tseries.offsets.Hour.n GL08" \
         -i "pandas.tseries.offsets.Hour.normalize GL08" \
-        -i "pandas.tseries.offsets.LastWeekOfMonth SA01" \
         -i "pandas.tseries.offsets.LastWeekOfMonth.is_on_offset GL08" \
         -i "pandas.tseries.offsets.LastWeekOfMonth.n GL08" \
         -i "pandas.tseries.offsets.LastWeekOfMonth.normalize GL08" \
diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx
@@ -720,11 +720,24 @@ cdef class BaseOffset:
         """
         Return boolean whether a timestamp intersects with this frequency.
 
+        This method determines if a given timestamp aligns with the start
+        of a custom business month, as defined by this offset. It accounts
+        for custom rules, such as skipping weekends or other non-business days,
+        and checks whether the provided datetime falls on a valid business day
+        that marks the beginning of the custom business month.
+
         Parameters
         ----------
         dt : datetime.datetime
             Timestamp to check intersections with frequency.
 
+        See Also
+        --------
+        tseries.offsets.CustomBusinessMonthBegin : Represents the start of a custom
+            business month.
+        tseries.offsets.CustomBusinessMonthEnd : Represents the end of a custom
+            business month.
+
         Examples
         --------
         >>> ts = pd.Timestamp(2022, 1, 1)
@@ -3710,6 +3723,15 @@ cdef class LastWeekOfMonth(WeekOfMonthMixin):
         - 5 is Saturday
         - 6 is Sunday.
 
+    See Also
+    --------
+    tseries.offsets.WeekOfMonth :
+        Date offset for a specific weekday in a month.
+    tseries.offsets.MonthEnd :
+        Date offset for the end of the month.
+    tseries.offsets.BMonthEnd :
+        Date offset for the last business day of the month.
+
     Examples
     --------
     >>> ts = pd.Timestamp(2022, 1, 1)
diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
@@ -1398,7 +1398,7 @@ def _to_datetimearray(self) -> DatetimeArray:
         np_dtype = np.dtype(f"M8[{pa_type.unit}]")
         dtype = tz_to_dtype(pa_type.tz, pa_type.unit)
         np_array = self._pa_array.to_numpy()
-        np_array = np_array.astype(np_dtype)
+        np_array = np_array.astype(np_dtype, copy=False)
         return DatetimeArray._simple_new(np_array, dtype=dtype)
 
     def _to_timedeltaarray(self) -> TimedeltaArray:
@@ -1409,7 +1409,7 @@ def _to_timedeltaarray(self) -> TimedeltaArray:
         assert pa.types.is_duration(pa_type)
         np_dtype = np.dtype(f"m8[{pa_type.unit}]")
         np_array = self._pa_array.to_numpy()
-        np_array = np_array.astype(np_dtype)
+        np_array = np_array.astype(np_dtype, copy=False)
         return TimedeltaArray._simple_new(np_array, dtype=np_dtype)
 
     def _values_for_json(self) -> np.ndarray:
diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py
@@ -165,7 +165,8 @@ def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame:
                 # The only way self.names is not the same length as number of cols is
                 # if we have int index_col. We should just pad the names(they will get
                 # removed anyways) to expected length then.
-                self.names = list(range(num_cols - len(self.names))) + self.names
+                columns_prefix = [str(x) for x in range(num_cols - len(self.names))]
+                self.names = columns_prefix + self.names
                 multi_index_named = False
             frame.columns = self.names
 
diff --git a/pandas/tests/frame/conftest.py b/pandas/tests/frame/conftest.py
@@ -33,7 +33,7 @@ def float_string_frame():
     df = DataFrame(
         np.random.default_rng(2).standard_normal((30, 4)),
         index=Index([f"foo_{i}" for i in range(30)], dtype=object),
-        columns=Index(list("ABCD"), dtype=object),
+        columns=Index(list("ABCD")),
     )
     df["foo"] = "bar"
     return df
diff --git a/pandas/tests/frame/constructors/test_from_dict.py b/pandas/tests/frame/constructors/test_from_dict.py
@@ -108,7 +108,6 @@ def test_constructor_list_of_series(self):
         expected = DataFrame.from_dict(sdict, orient="index")
         tm.assert_frame_equal(result, expected)
 
-    @pytest.mark.xfail(using_string_dtype(), reason="columns inferring logic broken")
     def test_constructor_orient(self, float_string_frame):
         data_dict = float_string_frame.T._series
         recons = DataFrame.from_dict(data_dict, orient="index")
diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py
@@ -7,8 +7,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 import pandas as pd
 from pandas import (
     Categorical,
@@ -162,21 +160,7 @@ def test_constructor_with_convert(self):
         )
         tm.assert_series_equal(result, expected)
 
-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     def test_construction_with_mixed(self, float_string_frame, using_infer_string):
-        # test construction edge cases with mixed types
-
-        # f7u12, this does not work without extensive workaround
-        data = [
-            [datetime(2001, 1, 5), np.nan, datetime(2001, 1, 2)],
-            [datetime(2000, 1, 2), datetime(2000, 1, 3), datetime(2000, 1, 1)],
-        ]
-        df = DataFrame(data)
-
-        # check dtypes
-        result = df.dtypes
-        expected = Series({"datetime64[us]": 3})
-
         # mixed-type frames
         float_string_frame["datetime"] = datetime.now()
         float_string_frame["timedelta"] = timedelta(days=1, seconds=1)
@@ -196,13 +180,11 @@ def test_construction_with_mixed(self, float_string_frame, using_infer_string):
         )
         tm.assert_series_equal(result, expected)
 
-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     def test_construction_with_conversions(self):
         # convert from a numpy array of non-ns timedelta64; as of 2.0 this does
         #  *not* convert
         arr = np.array([1, 2, 3], dtype="timedelta64[s]")
-        df = DataFrame(index=range(3))
-        df["A"] = arr
+        df = DataFrame({"A": arr})
         expected = DataFrame(
             {"A": pd.timedelta_range("00:00:01", periods=3, freq="s")}, index=range(3)
         )
@@ -220,11 +202,11 @@ def test_construction_with_conversions(self):
         assert expected.dtypes["dt1"] == "M8[s]"
         assert expected.dtypes["dt2"] == "M8[s]"
 
-        df = DataFrame(index=range(3))
-        df["dt1"] = np.datetime64("2013-01-01")
-        df["dt2"] = np.array(
+        dt1 = np.datetime64("2013-01-01")
+        dt2 = np.array(
             ["2013-01-01", "2013-01-02", "2013-01-03"], dtype="datetime64[D]"
         )
+        df = DataFrame({"dt1": dt1, "dt2": dt2})
 
         # df['dt3'] = np.array(['2013-01-01 00:00:01','2013-01-01
         # 00:00:02','2013-01-01 00:00:03'],dtype='datetime64[s]')
@@ -401,14 +383,17 @@ def test_update_inplace_sets_valid_block_values():
     assert isinstance(df._mgr.blocks[0].values, Categorical)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_nonconsolidated_item_cache_take():
     # https://github.com/pandas-dev/pandas/issues/35521
 
     # create non-consolidated dataframe with object dtype columns
-    df = DataFrame()
-    df["col1"] = Series(["a"], dtype=object)
+    df = DataFrame(
+        {
+            "col1": Series(["a"], dtype=object),
+        }
+    )
     df["col2"] = Series([0], dtype=object)
+    assert not df._mgr.is_consolidated()
 
     # access column (item cache)
     df["col1"] == "A"
diff --git a/pandas/tests/groupby/methods/test_size.py b/pandas/tests/groupby/methods/test_size.py
@@ -1,8 +1,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas import (
     DataFrame,
     Index,
@@ -76,18 +74,16 @@ def test_size_series_masked_type_returns_Int64(dtype):
     tm.assert_series_equal(result, expected)
 
 
-# TODO(infer_string) in case the column is object dtype, it should preserve that dtype
-# for the result's index
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
-def test_size_strings(any_string_dtype):
+def test_size_strings(any_string_dtype, using_infer_string):
     # GH#55627
     dtype = any_string_dtype
     df = DataFrame({"a": ["a", "a", "b"], "b": "a"}, dtype=dtype)
     result = df.groupby("a")["b"].size()
     exp_dtype = "Int64" if dtype == "string[pyarrow]" else "int64"
+    exp_index_dtype = "str" if using_infer_string and dtype == "object" else dtype
     expected = Series(
         [2, 1],
-        index=Index(["a", "b"], name="a", dtype=dtype),
+        index=Index(["a", "b"], name="a", dtype=exp_index_dtype),
         name="b",
         dtype=exp_dtype,
     )
diff --git a/pandas/tests/io/parser/common/test_index.py b/pandas/tests/io/parser/common/test_index.py
@@ -90,9 +90,6 @@ def test_pass_names_with_index(all_parsers, data, kwargs, expected):
 def test_multi_index_no_level_names(
     request, all_parsers, index_col, using_infer_string
 ):
-    if using_infer_string and all_parsers.engine == "pyarrow":
-        # result should have string columns instead of object dtype
-        request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)"))
     data = """index1,index2,A,B,C,D
 foo,one,2,3,4,5
 foo,two,7,8,9,10
diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
@@ -104,10 +104,7 @@ def fp(request):
 
 @pytest.fixture
 def df_compat():
-    # TODO(infer_string) should this give str columns?
-    return pd.DataFrame(
-        {"A": [1, 2, 3], "B": "foo"}, columns=pd.Index(["A", "B"], dtype=object)
-    )
+    return pd.DataFrame({"A": [1, 2, 3], "B": "foo"}, columns=pd.Index(["A", "B"]))
 
 
 @pytest.fixture
@@ -686,7 +683,11 @@ def test_parquet_read_from_url(self, httpserver, datapath, df_compat, engine):
         with open(datapath("io", "data", "parquet", "simple.parquet"), mode="rb") as f:
             httpserver.serve_content(content=f.read())
             df = read_parquet(httpserver.url, engine=engine)
-        tm.assert_frame_equal(df, df_compat)
+
+        expected = df_compat
+        if pa_version_under19p0:
+            expected.columns = expected.columns.astype(object)
+        tm.assert_frame_equal(df, expected)
 
 
 class TestParquetPyArrow(Base):

Original file line number	Diff line number	Diff line change
`@@ -33,7 +33,7 @@ def float_string_frame():`
`33`	`33`	`df = DataFrame(`
`34`	`34`	`np.random.default_rng(2).standard_normal((30, 4)),`
`35`	`35`	`index=Index([f"foo_{i}" for i in range(30)], dtype=object),`
`36`		`- columns=Index(list("ABCD"), dtype=object),`
	`36`	`+ columns=Index(list("ABCD")),`
`37`	`37`	`)`
`38`	`38`	`df["foo"] = "bar"`
`39`	`39`	`return df`