Merge remote-tracking branch 'upstream/main' into tst/ref/test_sql

mroeschke · mroeschke · commit 3edbcb056220 · 2023-10-23T18:12:01.000-07:00
diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst
@@ -301,6 +301,7 @@ Categorical
 
 Datetimelike
 ^^^^^^^^^^^^
+- Bug in :func:`concat` raising ``AttributeError`` when concatenating an all-NA DataFrame with a :class:`DatetimeTZDtype` dtype DataFrame (:issue:`52093`)
 - Bug in :meth:`DatetimeIndex.union` returning object dtype for tz-aware indexes with the same timezone but different units (:issue:`55238`)
 - Bug in :meth:`Tick.delta` with very large ticks raising ``OverflowError`` instead of ``OutOfBoundsTimedelta`` (:issue:`55503`)
 - Bug in adding or subtracting a :class:`Week` offset to a ``datetime64`` :class:`Series`, :class:`Index`, or :class:`DataFrame` column with non-nanosecond resolution returning incorrect results (:issue:`55583`)
@@ -345,9 +346,9 @@ Interval
 
 Indexing
 ^^^^^^^^
+- Bug in :meth:`DataFrame.loc` when setting :class:`Series` with extension dtype into NumPy dtype (:issue:`55604`)
 - Bug in :meth:`Index.difference` not returning a unique set of values when ``other`` is empty or ``other`` is considered non-comparable (:issue:`55113`)
 - Bug in setting :class:`Categorical` values into a :class:`DataFrame` with numpy dtypes raising ``RecursionError`` (:issue:`52927`)
--
 
 Missing
 ^^^^^^^
diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
@@ -993,7 +993,7 @@ cdef class TextReader:
             missing_usecols = [col for col in self.usecols if col >= num_cols]
             if missing_usecols:
                 raise ParserError(
-                    "Defining usecols without of bounds indices is not allowed. "
+                    "Defining usecols with out-of-bounds indices is not allowed. "
                     f"{missing_usecols} are out of bounds.",
                 )
 
diff --git a/pandas/_libs/tslibs/parsing.pyi b/pandas/_libs/tslibs/parsing.pyi
@@ -23,11 +23,6 @@ def try_parse_dates(
     values: npt.NDArray[np.object_],  # object[:]
     parser,
 ) -> npt.NDArray[np.object_]: ...
-def try_parse_year_month_day(
-    years: npt.NDArray[np.object_],  # object[:]
-    months: npt.NDArray[np.object_],  # object[:]
-    days: npt.NDArray[np.object_],  # object[:]
-) -> npt.NDArray[np.object_]: ...
 def guess_datetime_format(
     dt_str,
     dayfirst: bool | None = ...,
diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx
@@ -766,25 +766,6 @@ def try_parse_dates(object[:] values, parser) -> np.ndarray:
     return result.base  # .base to access underlying ndarray
 
 
-def try_parse_year_month_day(
-    object[:] years, object[:] months, object[:] days
-) -> np.ndarray:
-    cdef:
-        Py_ssize_t i, n
-        object[::1] result
-
-    n = len(years)
-    # TODO(cython3): Use len instead of `shape[0]`
-    if months.shape[0] != n or days.shape[0] != n:
-        raise ValueError("Length of years/months/days must all be equal")
-    result = np.empty(n, dtype="O")
-
-    for i in range(n):
-        result[i] = datetime(int(years[i]), int(months[i]), int(days[i]))
-
-    return result.base  # .base to access underlying ndarray
-
-
 # ----------------------------------------------------------------------
 # Miscellaneous
 
diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py
@@ -2308,6 +2308,7 @@ def _sequence_to_dt64ns(
         # assume this data are epoch timestamps
         if data.dtype != INT64_DTYPE:
             data = data.astype(np.int64, copy=False)
+            copy = False
         result = data.view(out_dtype)
 
     if copy:
diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
@@ -1816,7 +1816,8 @@ def np_can_hold_element(dtype: np.dtype, element: Any) -> Any:
             if not isinstance(tipo, np.dtype):
                 # i.e. nullable IntegerDtype; we can put this into an ndarray
                 #  losslessly iff it has no NAs
-                if element._hasna:
+                arr = element._values if isinstance(element, ABCSeries) else element
+                if arr._hasna:
                     raise LossySetitemError
                 return element
 
diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
@@ -26,6 +26,7 @@
     BlockPlacement,
     BlockValuesRefs,
 )
+from pandas._libs.tslibs import Timestamp
 from pandas.errors import PerformanceWarning
 from pandas.util._decorators import cache_readonly
 from pandas.util._exceptions import find_stack_level
@@ -2304,7 +2305,8 @@ def _preprocess_slice_or_indexer(
 def make_na_array(dtype: DtypeObj, shape: Shape, fill_value) -> ArrayLike:
     if isinstance(dtype, DatetimeTZDtype):
         # NB: exclude e.g. pyarrow[dt64tz] dtypes
-        i8values = np.full(shape, fill_value._value)
+        ts = Timestamp(fill_value).as_unit(dtype.unit)
+        i8values = np.full(shape, ts._value)
         return DatetimeArray(i8values, dtype=dtype)
 
     elif is_1d_only_ea_dtype(dtype):
diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py
@@ -28,8 +28,6 @@
     get_unit_from_dtype,
     iNaT,
     is_supported_unit,
-    nat_strings,
-    parsing,
     timezones as libtimezones,
 )
 from pandas._libs.tslibs.conversion import precision_from_unit
@@ -42,7 +40,6 @@
     AnyArrayLike,
     ArrayLike,
     DateTimeErrorChoices,
-    npt,
 )
 from pandas.util._exceptions import find_stack_level
 
@@ -62,14 +59,12 @@
     ABCDataFrame,
     ABCSeries,
 )
-from pandas.core.dtypes.missing import notna
 
 from pandas.arrays import (
     DatetimeArray,
     IntegerArray,
     NumpyExtensionArray,
 )
-from pandas.core import algorithms
 from pandas.core.algorithms import unique
 from pandas.core.arrays import ArrowExtensionArray
 from pandas.core.arrays.base import ExtensionArray
@@ -1273,58 +1268,6 @@ def coerce(values):
     return values
 
 
-def _attempt_YYYYMMDD(arg: npt.NDArray[np.object_], errors: str) -> np.ndarray | None:
-    """
-    try to parse the YYYYMMDD/%Y%m%d format, try to deal with NaT-like,
-    arg is a passed in as an object dtype, but could really be ints/strings
-    with nan-like/or floats (e.g. with nan)
-
-    Parameters
-    ----------
-    arg : np.ndarray[object]
-    errors : {'raise','ignore','coerce'}
-    """
-
-    def calc(carg):
-        # calculate the actual result
-        carg = carg.astype(object, copy=False)
-        parsed = parsing.try_parse_year_month_day(
-            carg / 10000, carg / 100 % 100, carg % 100
-        )
-        return tslib.array_to_datetime(parsed, errors=errors)[0]
-
-    def calc_with_mask(carg, mask):
-        result = np.empty(carg.shape, dtype="M8[ns]")
-        iresult = result.view("i8")
-        iresult[~mask] = iNaT
-
-        masked_result = calc(carg[mask].astype(np.float64).astype(np.int64))
-        result[mask] = masked_result.astype("M8[ns]")
-        return result
-
-    # try intlike / strings that are ints
-    try:
-        return calc(arg.astype(np.int64))
-    except (ValueError, OverflowError, TypeError):
-        pass
-
-    # a float with actual np.nan
-    try:
-        carg = arg.astype(np.float64)
-        return calc_with_mask(carg, notna(carg))
-    except (ValueError, OverflowError, TypeError):
-        pass
-
-    # string with NaN-like
-    try:
-        mask = ~algorithms.isin(arg, list(nat_strings))
-        return calc_with_mask(arg, mask)
-    except (ValueError, OverflowError, TypeError):
-        pass
-
-    return None
-
-
 __all__ = [
     "DateParseError",
     "should_cache",
diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py
@@ -615,8 +615,8 @@ def _handle_usecols(
                 ]
                 if missing_usecols:
                     raise ParserError(
-                        "Defining usecols without of bounds indices is not allowed. "
-                        f"{missing_usecols} are out of bounds.",
+                        "Defining usecols with out-of-bounds indices is not allowed. "
+                        f"{missing_usecols} are out-of-bounds.",
                     )
                 col_indices = self.usecols
 
diff --git a/pandas/tests/arrays/datetimes/test_constructors.py b/pandas/tests/arrays/datetimes/test_constructors.py
@@ -8,7 +8,6 @@
 import pandas as pd
 import pandas._testing as tm
 from pandas.core.arrays import DatetimeArray
-from pandas.core.arrays.datetimes import _sequence_to_dt64ns
 
 
 class TestDatetimeArrayConstructor:
@@ -44,7 +43,6 @@ def test_freq_validation(self):
         "meth",
         [
             DatetimeArray._from_sequence,
-            _sequence_to_dt64ns,
             pd.to_datetime,
             pd.DatetimeIndex,
         ],
@@ -104,9 +102,6 @@ def test_bool_dtype_raises(self):
         with pytest.raises(TypeError, match=msg):
             DatetimeArray._from_sequence(arr)
 
-        with pytest.raises(TypeError, match=msg):
-            _sequence_to_dt64ns(arr)
-
         with pytest.raises(TypeError, match=msg):
             pd.DatetimeIndex(arr)
 
@@ -143,14 +138,12 @@ def test_tz_dtype_mismatch_raises(self):
             ["2000"], dtype=DatetimeTZDtype(tz="US/Central")
         )
         with pytest.raises(TypeError, match="data is already tz-aware"):
-            DatetimeArray._from_sequence_not_strict(
-                arr, dtype=DatetimeTZDtype(tz="UTC")
-            )
+            DatetimeArray._from_sequence(arr, dtype=DatetimeTZDtype(tz="UTC"))
 
     def test_tz_dtype_matches(self):
         dtype = DatetimeTZDtype(tz="US/Central")
         arr = DatetimeArray._from_sequence(["2000"], dtype=dtype)
-        result = DatetimeArray._from_sequence_not_strict(arr, dtype=dtype)
+        result = DatetimeArray._from_sequence(arr, dtype=dtype)
         tm.assert_equal(arr, result)
 
     @pytest.mark.parametrize("order", ["F", "C"])
@@ -160,13 +153,6 @@ def test_2d(self, order):
         if order == "F":
             arr = arr.T
 
-        res = _sequence_to_dt64ns(arr)
-        expected = _sequence_to_dt64ns(arr.ravel())
-
-        tm.assert_numpy_array_equal(res[0].ravel(), expected[0])
-        assert res[1] == expected[1]
-        assert res[2] == expected[2]
-
         res = DatetimeArray._from_sequence(arr)
         expected = DatetimeArray._from_sequence(arr.ravel()).reshape(arr.shape)
         tm.assert_datetime_array_equal(res, expected)
diff --git a/pandas/tests/arrays/datetimes/test_cumulative.py b/pandas/tests/arrays/datetimes/test_cumulative.py
@@ -7,40 +7,35 @@
 class TestAccumulator:
     def test_accumulators_freq(self):
         # GH#50297
-        arr = DatetimeArray._from_sequence_not_strict(
+        arr = DatetimeArray._from_sequence(
             [
                 "2000-01-01",
                 "2000-01-02",
                 "2000-01-03",
-            ],
-            freq="D",
-        )
+            ]
+        )._with_freq("infer")
         result = arr._accumulate("cummin")
-        expected = DatetimeArray._from_sequence_not_strict(
-            ["2000-01-01"] * 3, freq=None
-        )
+        expected = DatetimeArray._from_sequence(["2000-01-01"] * 3)
         tm.assert_datetime_array_equal(result, expected)
 
         result = arr._accumulate("cummax")
-        expected = DatetimeArray._from_sequence_not_strict(
+        expected = DatetimeArray._from_sequence(
             [
                 "2000-01-01",
                 "2000-01-02",
                 "2000-01-03",
             ],
-            freq=None,
         )
         tm.assert_datetime_array_equal(result, expected)
 
     @pytest.mark.parametrize("func", ["cumsum", "cumprod"])
     def test_accumulators_disallowed(self, func):
         # GH#50297
-        arr = DatetimeArray._from_sequence_not_strict(
+        arr = DatetimeArray._from_sequence(
             [
                 "2000-01-01",
                 "2000-01-02",
             ],
-            freq="D",
-        )
+        )._with_freq("infer")
         with pytest.raises(TypeError, match=f"Accumulation {func}"):
             arr._accumulate(func)
diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py
@@ -27,8 +27,6 @@
     PeriodArray,
     TimedeltaArray,
 )
-from pandas.core.arrays.datetimes import _sequence_to_dt64ns
-from pandas.core.arrays.timedeltas import sequence_to_td64ns
 
 
 # TODO: more freq variants
@@ -1314,11 +1312,6 @@ def test_from_pandas_array(dtype):
     expected = cls._from_sequence(data)
     tm.assert_extension_array_equal(result, expected)
 
-    func = {"M8[ns]": _sequence_to_dt64ns, "m8[ns]": sequence_to_td64ns}[dtype]
-    result = func(arr)[0]
-    expected = func(data)[0]
-    tm.assert_equal(result, expected)
-
     func = {"M8[ns]": pd.to_datetime, "m8[ns]": pd.to_timedelta}[dtype]
     result = func(arr).array
     expected = func(data).array
diff --git a/pandas/tests/arrays/timedeltas/test_cumulative.py b/pandas/tests/arrays/timedeltas/test_cumulative.py
@@ -7,13 +7,13 @@
 class TestAccumulator:
     def test_accumulators_disallowed(self):
         # GH#50297
-        arr = TimedeltaArray._from_sequence_not_strict(["1D", "2D"])
+        arr = TimedeltaArray._from_sequence(["1D", "2D"])
         with pytest.raises(TypeError, match="cumprod not supported"):
             arr._accumulate("cumprod")
 
     def test_cumsum(self):
         # GH#50297
-        arr = TimedeltaArray._from_sequence_not_strict(["1D", "2D"])
+        arr = TimedeltaArray._from_sequence(["1D", "2D"])
         result = arr._accumulate("cumsum")
-        expected = TimedeltaArray._from_sequence_not_strict(["1D", "3D"])
+        expected = TimedeltaArray._from_sequence(["1D", "3D"])
         tm.assert_timedelta_array_equal(result, expected)
diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py
@@ -761,6 +761,17 @@ def test_setitem_frame_midx_columns(self):
         df[col_name] = df[[col_name]]
         tm.assert_frame_equal(df, expected)
 
+    def test_loc_setitem_ea_dtype(self):
+        # GH#55604
+        df = DataFrame({"a": np.array([10], dtype="i8")})
+        df.loc[:, "a"] = Series([11], dtype="Int64")
+        expected = DataFrame({"a": np.array([11], dtype="i8")})
+        tm.assert_frame_equal(df, expected)
+
+        df = DataFrame({"a": np.array([10], dtype="i8")})
+        df.iloc[:, 0] = Series([11], dtype="Int64")
+        tm.assert_frame_equal(df, expected)
+
 
 class TestSetitemTZAwareValues:
     @pytest.fixture
diff --git a/pandas/tests/indexes/datetimes/test_constructors.py b/pandas/tests/indexes/datetimes/test_constructors.py
@@ -74,20 +74,17 @@ def test_explicit_tz_none(self):
         with pytest.raises(ValueError, match=msg):
             DatetimeIndex([], dtype="M8[ns, UTC]", tz=None)
 
-    @pytest.mark.parametrize(
-        "dt_cls", [DatetimeIndex, DatetimeArray._from_sequence_not_strict]
-    )
-    def test_freq_validation_with_nat(self, dt_cls):
+    def test_freq_validation_with_nat(self):
         # GH#11587 make sure we get a useful error message when generate_range
         #  raises
         msg = (
             "Inferred frequency None from passed values does not conform "
             "to passed frequency D"
         )
         with pytest.raises(ValueError, match=msg):
-            dt_cls([pd.NaT, Timestamp("2011-01-01")], freq="D")
+            DatetimeIndex([pd.NaT, Timestamp("2011-01-01")], freq="D")
         with pytest.raises(ValueError, match=msg):
-            dt_cls([pd.NaT, Timestamp("2011-01-01")._value], freq="D")
+            DatetimeIndex([pd.NaT, Timestamp("2011-01-01")._value], freq="D")
 
     # TODO: better place for tests shared by DTI/TDI?
     @pytest.mark.parametrize(
diff --git a/pandas/tests/indexes/timedeltas/test_constructors.py b/pandas/tests/indexes/timedeltas/test_constructors.py
diff --git a/pandas/tests/io/parser/usecols/test_usecols_basic.py b/pandas/tests/io/parser/usecols/test_usecols_basic.py
diff --git a/pandas/tests/io/test_spss.py b/pandas/tests/io/test_spss.py
diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py
diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py

Original file line number	Diff line number	Diff line change
`@@ -993,7 +993,7 @@ cdef class TextReader:`
`993`	`993`	`missing_usecols = [col for col in self.usecols if col >= num_cols]`
`994`	`994`	`if missing_usecols:`
`995`	`995`	`raise ParserError(`
`996`		`- "Defining usecols without of bounds indices is not allowed. "`
	`996`	`+ "Defining usecols with out-of-bounds indices is not allowed. "`
`997`	`997`	`f"{missing_usecols} are out of bounds.",`
`998`	`998`	`)`
`999`	`999`
Original file line number	Diff line number	Diff line change
`@@ -615,8 +615,8 @@ def _handle_usecols(`
`615`	`615`	`]`
`616`	`616`	`if missing_usecols:`
`617`	`617`	`raise ParserError(`
`618`		`- "Defining usecols without of bounds indices is not allowed. "`
`619`		`- f"{missing_usecols} are out of bounds.",`
	`618`	`+ "Defining usecols with out-of-bounds indices is not allowed. "`
	`619`	`+ f"{missing_usecols} are out-of-bounds.",`
`620`	`620`	`)`
`621`	`621`	`col_indices = self.usecols`
`622`	`622`