diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 798a3d838ef7e..9795e58995ab5 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -1144,6 +1144,7 @@ ExtensionArray - Fixed bug where :meth:`StringArray.memory_usage` was not implemented (:issue:`33963`) - Fixed bug where :meth:`DataFrameGroupBy` would ignore the ``min_count`` argument for aggregations on nullable boolean dtypes (:issue:`34051`) - Fixed bug that `DataFrame(columns=.., dtype='string')` would fail (:issue:`27953`, :issue:`33623`) +- Bug where :class:`DataFrame` column set to scalar extension type was considered an object type rather than the extension type (:issue:`34832`) - Fixed bug in ``IntegerArray.astype`` to correctly copy the mask as well (:issue:`34931`). Other diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 3d2200cb45c6e..bc09761803b75 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -519,25 +519,43 @@ def __init__( mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy) else: mgr = init_dict({}, index, columns, dtype=dtype) + # For data is scalar else: - try: - arr = np.array(data, dtype=dtype, copy=copy) - except (ValueError, TypeError) as err: - exc = TypeError( - "DataFrame constructor called with " - f"incompatible data and dtype: {err}" - ) - raise exc from err + if index is None or columns is None: + raise ValueError("DataFrame constructor not properly called!") + + if not dtype: + dtype, _ = infer_dtype_from_scalar(data, pandas_dtype=True) + + # For data is a scalar extension dtype + if is_extension_array_dtype(dtype): + + values = [ + construct_1d_arraylike_from_scalar(data, len(index), dtype) + for _ in range(len(columns)) + ] + mgr = arrays_to_mgr(values, columns, index, columns, dtype=None) + else: + # Attempt to coerce to a numpy array + try: + arr = np.array(data, dtype=dtype, copy=copy) + except (ValueError, TypeError) as err: + exc = TypeError( + "DataFrame constructor called with " + f"incompatible data and dtype: {err}" + ) + raise exc from err + + if arr.ndim != 0: + raise ValueError("DataFrame constructor not properly called!") - if arr.ndim == 0 and index is not None and columns is not None: values = cast_scalar_to_array( (len(index), len(columns)), data, dtype=dtype ) + mgr = init_ndarray( values, index, columns, dtype=values.dtype, copy=False ) - else: - raise ValueError("DataFrame constructor not properly called!") NDFrame.__init__(self, mgr) @@ -3739,7 +3757,13 @@ def reindexer(value): infer_dtype, _ = infer_dtype_from_scalar(value, pandas_dtype=True) # upcast - value = cast_scalar_to_array(len(self.index), value) + if is_extension_array_dtype(infer_dtype): + value = construct_1d_arraylike_from_scalar( + value, len(self.index), infer_dtype + ) + else: + value = cast_scalar_to_array(len(self.index), value) + value = maybe_cast_to_datetime(value, infer_dtype) # return internal types directly diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index 8fcdae95fbab5..9bb5338f1e07f 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -1,7 +1,18 @@ import numpy as np import pytest -from pandas import Categorical, DataFrame, Index, Series, Timestamp, date_range +from pandas.core.dtypes.dtypes import DatetimeTZDtype, IntervalDtype, PeriodDtype + +from pandas import ( + Categorical, + DataFrame, + Index, + Interval, + Period, + Series, + Timestamp, + date_range, +) import pandas._testing as tm from pandas.core.arrays import SparseArray @@ -150,3 +161,23 @@ def test_setitem_dict_preserves_dtypes(self): "c": float(b), } tm.assert_frame_equal(df, expected) + + @pytest.mark.parametrize( + "obj,dtype", + [ + (Period("2020-01"), PeriodDtype("M")), + (Interval(left=0, right=5), IntervalDtype("int64")), + ( + Timestamp("2011-01-01", tz="US/Eastern"), + DatetimeTZDtype(tz="US/Eastern"), + ), + ], + ) + def test_setitem_extension_types(self, obj, dtype): + # GH: 34832 + expected = DataFrame({"idx": [1, 2, 3], "obj": Series([obj] * 3, dtype=dtype)}) + + df = DataFrame({"idx": [1, 2, 3]}) + df["obj"] = obj + + tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py index 7715cb1cb6eec..78f265d32f8df 100644 --- a/pandas/tests/frame/methods/test_combine_first.py +++ b/pandas/tests/frame/methods/test_combine_first.py @@ -199,12 +199,14 @@ def test_combine_first_timezone(self): columns=["UTCdatetime", "abc"], data=data1, index=pd.date_range("20140627", periods=1), + dtype="object", ) data2 = pd.to_datetime("20121212 12:12").tz_localize("UTC") df2 = pd.DataFrame( columns=["UTCdatetime", "xyz"], data=data2, index=pd.date_range("20140628", periods=1), + dtype="object", ) res = df2[["UTCdatetime"]].combine_first(df1) exp = pd.DataFrame( @@ -217,10 +219,14 @@ def test_combine_first_timezone(self): }, columns=["UTCdatetime", "abc"], index=pd.date_range("20140627", periods=2, freq="D"), + dtype="object", ) - tm.assert_frame_equal(res, exp) assert res["UTCdatetime"].dtype == "datetime64[ns, UTC]" assert res["abc"].dtype == "datetime64[ns, UTC]" + # Need to cast all to "obejct" because combine_first does not retain dtypes: + # GH Issue 7509 + res = res.astype("object") + tm.assert_frame_equal(res, exp) # see gh-10567 dts1 = pd.date_range("2015-01-01", "2015-01-05", tz="UTC") diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index ab4f7781467e7..64ae29e6de63c 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -14,13 +14,16 @@ from pandas.compat.numpy import _np_version_under1p19 from pandas.core.dtypes.common import is_integer_dtype +from pandas.core.dtypes.dtypes import DatetimeTZDtype, IntervalDtype, PeriodDtype import pandas as pd from pandas import ( Categorical, DataFrame, Index, + Interval, MultiIndex, + Period, RangeIndex, Series, Timedelta, @@ -700,7 +703,7 @@ def create_data(constructor): tm.assert_frame_equal(result_timedelta, expected) tm.assert_frame_equal(result_Timedelta, expected) - def test_constructor_period(self): + def test_constructor_period_dict(self): # PeriodIndex a = pd.PeriodIndex(["2012-01", "NaT", "2012-04"], freq="M") b = pd.PeriodIndex(["2012-02-01", "2012-03-01", "NaT"], freq="D") @@ -713,6 +716,29 @@ def test_constructor_period(self): assert df["a"].dtype == a.dtype assert df["b"].dtype == b.dtype + @pytest.mark.parametrize( + "data,dtype", + [ + (Period("2020-01"), PeriodDtype("M")), + (Interval(left=0, right=5), IntervalDtype("int64")), + ( + Timestamp("2011-01-01", tz="US/Eastern"), + DatetimeTZDtype(tz="US/Eastern"), + ), + ], + ) + def test_constructor_extension_scalar_data(self, data, dtype): + # GH 34832 + df = DataFrame(index=[0, 1], columns=["a", "b"], data=data) + + assert df["a"].dtype == dtype + assert df["b"].dtype == dtype + + arr = pd.array([data] * 2, dtype=dtype) + expected = DataFrame({"a": arr, "b": arr}) + + tm.assert_frame_equal(df, expected) + def test_nested_dict_frame_constructor(self): rng = pd.period_range("1/1/2000", periods=5) df = DataFrame(np.random.randn(10, 5), columns=rng)