diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 7628c53cefa06..c9210a5597d48 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1537,6 +1537,7 @@ Missing - Bug in :func:`Series.hasnans` that could be incorrectly cached and return incorrect answers if null elements are introduced after an initial call (:issue:`19700`) - :func:`Series.isin` now treats all NaN-floats as equal also for ``np.object``-dtype. This behavior is consistent with the behavior for float64 (:issue:`22119`) - :func:`unique` no longer mangles NaN-floats and the ``NaT``-object for ``np.object``-dtype, i.e. ``NaT`` is no longer coerced to a NaN-value and is treated as a different entity. (:issue:`22295`) +- :class:`DataFrame` and :class:`Series` now properly handle NumPy masked arrays with hardened masks. Previously, constructing a DataFrame or Series from a masked array with a hard mask would create a pandas object containing the underlying value, rather than the expected NaN. 
(:issue:`24574`) MultiIndex diff --git a/pandas/core/frame.py b/pandas/core/frame.py index d6aa3117570af..76d3d704497b4 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -400,6 +400,7 @@ def __init__(self, data=None, index=None, columns=None, dtype=None, mask = ma.getmaskarray(data) if mask.any(): data, fill_value = maybe_upcast(data, copy=True) + data.soften_mask() # hard mask would block the fill below data[mask] = fill_value else: data = data.copy() diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index b3c893c7d84be..446ad72ac4a53 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -547,6 +547,7 @@ def sanitize_array(data, index, dtype=None, copy=False, mask = ma.getmaskarray(data) if mask.any(): data, fill_value = maybe_upcast(data, copy=True) + data.soften_mask() # hard mask would block the fill below data[mask] = fill_value else: data = data.copy() diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 8a5ec1a16d1df..c8b3f23db1492 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -757,6 +757,28 @@ def test_constructor_maskedarray_nonfloat(self): assert frame['A'][1] is True assert frame['C'][2] is False + def test_constructor_maskedarray_hardened(self): + # GH 24574: masked entries of a hard-masked array must become NaN + mat_hard = ma.masked_all((2, 2), dtype=float).harden_mask() + result = pd.DataFrame(mat_hard, columns=['A', 'B'], index=[1, 2]) + expected = pd.DataFrame({ + 'A': [np.nan, np.nan], + 'B': [np.nan, np.nan]}, + columns=['A', 'B'], + index=[1, 2], + dtype=float) + tm.assert_frame_equal(result, expected) + # A hard mask with nothing masked must pass the data through unchanged + mat_hard = ma.ones((2, 2), dtype=float).harden_mask() + result = pd.DataFrame(mat_hard, columns=['A', 'B'], index=[1, 2]) + expected = pd.DataFrame({ + 'A': [1.0, 1.0], + 'B': [1.0, 1.0]}, + 
columns=['A', 'B'], + index=[1, 2], + dtype=float) + tm.assert_frame_equal(result, expected) + def test_constructor_mrecarray(self): # Ensure mrecarray produces frame identical to dict of masked arrays # from GH3479 diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index f5a445e2cca9a..667065d09758b 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -451,6 +451,13 @@ def test_constructor_maskedarray(self): datetime(2001, 1, 3)], index=index, dtype='M8[ns]') assert_series_equal(result, expected) + def test_constructor_maskedarray_hardened(self): + # GH 24574: masked entries of a hard-masked array must become NaN + data = ma.masked_all((3, ), dtype=float).harden_mask() + result = pd.Series(data) + expected = pd.Series([nan, nan, nan]) + tm.assert_series_equal(result, expected) + def test_series_ctor_plus_datetimeindex(self): rng = date_range('20090415', '20090519', freq='B') data = {k: 1 for k in rng}