Skip to content

DF.__setitem__ creates extension column when given extension scalar #34875

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 42 commits into from
Jul 11, 2020
Merged
Show file tree
Hide file tree
Changes from 39 commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
0ec5911
Bugfix to make DF.__setitem__ create extension column instead of obje…
justinessert Jun 19, 2020
9336955
removed bad whitespace
justinessert Jun 19, 2020
01fb076
Apply suggestions from code review
justinessert Jun 22, 2020
5c8b356
added missing :
justinessert Jun 22, 2020
2c1f640
modified cast_extension_scalar_to_array test to include an Interval type
justinessert Jun 22, 2020
d509bf4
added user-facing test for extension type bug
justinessert Jun 22, 2020
e231bb1
fixed pep8 issues
justinessert Jun 22, 2020
18ed043
added note about bug in setting series to scalar extension type
justinessert Jun 22, 2020
a6b18f4
corrected order of imports
justinessert Jun 22, 2020
cbc29be
corrected order of imports
justinessert Jun 22, 2020
2f79822
fixed black formatting errors
justinessert Jun 22, 2020
0f9178e
removed extra comma
justinessert Jun 22, 2020
bfa18fb
updated cast_scalar_to_arr to support tuple shape for extension dtype
justinessert Jun 23, 2020
e7e9a48
removed unneeded code
justinessert Jun 23, 2020
291eb2d
added coverage for datetime with timezone in extension_array test
justinessert Jun 23, 2020
3a788ed
added TODO
justinessert Jun 23, 2020
38d7ce5
correct line that was too long
justinessert Jun 23, 2020
a5e8df5
fixed dtype issue with tz test
justinessert Jun 23, 2020
5e439bd
creating distinct arrays for each column
justinessert Jun 24, 2020
6cc7959
resolving mypy error
justinessert Jun 24, 2020
7e27a6e
added docstring info and test
justinessert Jun 24, 2020
90a8570
removed unneeded import
justinessert Jun 24, 2020
39b2984
flattened else case in init
justinessert Jun 26, 2020
7a01041
refactored extension type column fix
justinessert Jun 26, 2020
03e528b
reverted docstring changes
justinessert Jun 26, 2020
7bb9553
reverted docstring changes
justinessert Jun 26, 2020
a3be9a6
removed unneeded imports
justinessert Jun 26, 2020
3a92164
reverted test changes
justinessert Jun 26, 2020
c93a847
fixed construct_1d_arraylike bug
justinessert Jun 26, 2020
966283a
reorganized if statements
justinessert Jun 30, 2020
f2aea7b
moved what's new statement to correct file
justinessert Jun 30, 2020
6495a36
created new test for period df construction
justinessert Jun 30, 2020
42e7afa
added assert_frame_equal to period_data test
justinessert Jun 30, 2020
8343df3
Using pandas array instead of df constructor for better test
justinessert Jul 7, 2020
a50a42c
changed wording
justinessert Jul 7, 2020
3452c20
Merge branch 'master' of https://github.com/justinessert/pandas
justinessert Jul 7, 2020
6f3fb51
pylint fixes
justinessert Jul 7, 2020
b95cdfc
parameterized test and added comment
justinessert Jul 8, 2020
6830fde
removed extra comma
justinessert Jul 8, 2020
6653ef8
Merge branch 'master' into master
justinessert Jul 10, 2020
c73a2de
parameterized test
justinessert Jul 10, 2020
100f334
renamed test
justinessert Jul 10, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1078,6 +1078,7 @@ ExtensionArray
- Fixed bug where :meth:`StringArray.memory_usage` was not implemented (:issue:`33963`)
- Fixed bug where :meth:`DataFrameGroupBy` would ignore the ``min_count`` argument for aggregations on nullable boolean dtypes (:issue:`34051`)
- Fixed bug that `DataFrame(columns=.., dtype='string')` would fail (:issue:`27953`, :issue:`33623`)
- Bug where :class:`DataFrame` column set to scalar extension type was considered an object type rather than the extension type (:issue:`34832`)

Other
^^^^^
Expand Down
49 changes: 37 additions & 12 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@
from pandas.core.dtypes.cast import (
cast_scalar_to_array,
coerce_to_dtypes,
construct_1d_arraylike_from_scalar,
find_common_type,
infer_dtype_from_scalar,
invalidate_string_dtypes,
Expand Down Expand Up @@ -514,25 +515,43 @@ def __init__(
mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)
else:
mgr = init_dict({}, index, columns, dtype=dtype)
# For data is scalar
else:
try:
arr = np.array(data, dtype=dtype, copy=copy)
except (ValueError, TypeError) as err:
exc = TypeError(
"DataFrame constructor called with "
f"incompatible data and dtype: {err}"
)
raise exc from err
if index is None or columns is None:
raise ValueError("DataFrame constructor not properly called!")

if not dtype:
dtype, _ = infer_dtype_from_scalar(data, pandas_dtype=True)

# For data is a scalar extension dtype
if is_extension_array_dtype(dtype):

values = [
construct_1d_arraylike_from_scalar(data, len(index), dtype)
for _ in range(len(columns))
]
mgr = arrays_to_mgr(values, columns, index, columns, dtype=None)
else:
# Attempt to coerce to a numpy array
try:
arr = np.array(data, dtype=dtype, copy=copy)
except (ValueError, TypeError) as err:
exc = TypeError(
"DataFrame constructor called with "
f"incompatible data and dtype: {err}"
)
raise exc from err

if arr.ndim != 0:
raise ValueError("DataFrame constructor not properly called!")

if arr.ndim == 0 and index is not None and columns is not None:
values = cast_scalar_to_array(
(len(index), len(columns)), data, dtype=dtype
)

mgr = init_ndarray(
values, index, columns, dtype=values.dtype, copy=False
)
else:
raise ValueError("DataFrame constructor not properly called!")

NDFrame.__init__(self, mgr)

Expand Down Expand Up @@ -3730,7 +3749,13 @@ def reindexer(value):
infer_dtype, _ = infer_dtype_from_scalar(value, pandas_dtype=True)

# upcast
value = cast_scalar_to_array(len(self.index), value)
if is_extension_array_dtype(infer_dtype):
value = construct_1d_arraylike_from_scalar(
value, len(self.index), infer_dtype
)
else:
value = cast_scalar_to_array(len(self.index), value)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you not do this inside cast_scalar_to_array (pass in the infer_dtype optionally)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you clarify what you mean by "this"? Your last comments were to move this whole if/else logic out of cast_scalar_to_array, which I do agree made for a much cleaner solution

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

instead of adding this if/then clause, can you change cast_scalar_to_array to just handle if the dtype is an extension_array? (and obviously pass infer_dtype here)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

cast_scalar_to_array only deals with numpy arrays. Initially this PR added handling extension dtypes in there as well, but after discussion we actually moved it out because it gave a lot of complications (eg EAs don't support 2D arrays, while cast_scalar_to_array generally does).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

exactly the reason to make the change. we are almost certainly going to need to do this in other places and this is a natural fit.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To be more specific, it would be easy if cast_scalar_to_array were simply returning a single 1d array. But in most cases it is not; it is returning a 2d array.
Since there's no good way to make a 2d extension array, I handled this by returning a list of extension arrays. Is this what you're asking me to go back to doing?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We cannot "simply move the is_extension_array_dtype case into cast_scalar_to_array", because this is a specific case where cast_scalar_to_array is receiving an int as its shape parameter. But in general, shape is a tuple that defines the shape of the 2d array.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

https://github.com/pandas-dev/pandas/blob/master/pandas/core/dtypes/cast.py#L1492

you just need to move the if_extension_dtype into this function. I don't think there is ambiguity or anything. This returns exactly 1 ndarray. I am not sure where 2D even matters here at all.

Copy link
Contributor Author

@justinessert justinessert Jul 10, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A single ndarray can be 2 dimensional, and in this case it often is, for example, when it is used here

Doing this in cast_scalar_to_array would NOT work without a larger refactor.

if is_extension_array_dtype(dtype):
    value = construct_1d_arraylike_from_scalar(fill_value, shape, dtype)
else:
    values = np.empty(shape, dtype=dtype)
    values.fill(fill_value)

this shape parameter is usually a tuple not an int like it is here, so construct_1d_arraylike_from_scalar would fail when shape=(2, 5) for example (since extension arrays are only 1d)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"This returns exactly 1 ndarray" is technically true, but it can be a 2d ndarray

For example

values = np.empty((2,3), dtype=int)
values.fill(2)

would give you a numpy array like

[[2, 2, 2], [2, 2, 2]]


value = maybe_cast_to_datetime(value, infer_dtype)

# return internal types directly
Expand Down
33 changes: 32 additions & 1 deletion pandas/tests/frame/indexing/test_setitem.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,18 @@
import numpy as np
import pytest

from pandas import Categorical, DataFrame, Index, Series, Timestamp, date_range
from pandas.core.dtypes.dtypes import DatetimeTZDtype, IntervalDtype, PeriodDtype

from pandas import (
Categorical,
DataFrame,
Index,
Interval,
Period,
Series,
Timestamp,
date_range,
)
import pandas._testing as tm
from pandas.core.arrays import SparseArray

Expand Down Expand Up @@ -150,3 +161,23 @@ def test_setitem_dict_preserves_dtypes(self):
"c": float(b),
}
tm.assert_frame_equal(df, expected)

@pytest.mark.parametrize(
"obj,dtype",
[
(Period("2020-01"), PeriodDtype("M")),
(Interval(left=0, right=5), IntervalDtype("int64")),
(
Timestamp("2011-01-01", tz="US/Eastern"),
DatetimeTZDtype(tz="US/Eastern"),
),
],
)
def test_setitem_extension_types(self, obj, dtype):
# GH: 34832
expected = DataFrame({"idx": [1, 2, 3], "obj": Series([obj] * 3, dtype=dtype)})

df = DataFrame({"idx": [1, 2, 3]})
df["obj"] = obj

tm.assert_frame_equal(df, expected)
8 changes: 7 additions & 1 deletion pandas/tests/frame/methods/test_combine_first.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,12 +199,14 @@ def test_combine_first_timezone(self):
columns=["UTCdatetime", "abc"],
data=data1,
index=pd.date_range("20140627", periods=1),
dtype="object",
)
data2 = pd.to_datetime("20121212 12:12").tz_localize("UTC")
df2 = pd.DataFrame(
columns=["UTCdatetime", "xyz"],
data=data2,
index=pd.date_range("20140628", periods=1),
dtype="object",
)
res = df2[["UTCdatetime"]].combine_first(df1)
exp = pd.DataFrame(
Expand All @@ -217,10 +219,14 @@ def test_combine_first_timezone(self):
},
columns=["UTCdatetime", "abc"],
index=pd.date_range("20140627", periods=2, freq="D"),
dtype="object",
)
tm.assert_frame_equal(res, exp)
assert res["UTCdatetime"].dtype == "datetime64[ns, UTC]"
assert res["abc"].dtype == "datetime64[ns, UTC]"
# Need to cast all to "object" because combine_first does not retain dtypes:
# GH Issue 7509
res = res.astype("object")
tm.assert_frame_equal(res, exp)

# see gh-10567
dts1 = pd.date_range("2015-01-01", "2015-01-05", tz="UTC")
Expand Down
16 changes: 15 additions & 1 deletion pandas/tests/frame/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from pandas.compat.numpy import _is_numpy_dev

from pandas.core.dtypes.common import is_integer_dtype
from pandas.core.dtypes.dtypes import PeriodDtype

import pandas as pd
from pandas import (
Expand Down Expand Up @@ -710,7 +711,7 @@ def create_data(constructor):
tm.assert_frame_equal(result_timedelta, expected)
tm.assert_frame_equal(result_Timedelta, expected)

def test_constructor_period(self):
def test_constructor_period_dict(self):
# PeriodIndex
a = pd.PeriodIndex(["2012-01", "NaT", "2012-04"], freq="M")
b = pd.PeriodIndex(["2012-02-01", "2012-03-01", "NaT"], freq="D")
Expand All @@ -723,6 +724,19 @@ def test_constructor_period(self):
assert df["a"].dtype == a.dtype
assert df["b"].dtype == b.dtype

def test_constructor_period_data(self):
# GH 34832
data = pd.Period("2012-01", freq="M")
df = DataFrame(index=[0, 1], columns=["a", "b"], data=data)

assert df["a"].dtype == PeriodDtype("M")
assert df["b"].dtype == PeriodDtype("M")

arr = pd.array([data] * 2, dtype=PeriodDtype("M"))
expected = DataFrame({"a": arr, "b": arr})

tm.assert_frame_equal(df, expected)

def test_nested_dict_frame_constructor(self):
rng = pd.period_range("1/1/2000", periods=5)
df = DataFrame(np.random.randn(10, 5), columns=rng)
Expand Down