From b20389d19a0461d1fdf006204f0b869ada617e3b Mon Sep 17 00:00:00 2001 From: karldw Date: Wed, 2 Jan 2019 18:24:30 -0800 Subject: [PATCH 1/3] Support hard-masked numpy arrays --- pandas/core/frame.py | 1 + pandas/core/internals/construction.py | 1 + pandas/tests/frame/test_constructors.py | 14 ++++++++++++++ pandas/tests/series/test_constructors.py | 5 +++++ 4 files changed, 21 insertions(+) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index d6aa3117570af..dda9fffacd91c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -400,6 +400,7 @@ def __init__(self, data=None, index=None, columns=None, dtype=None, mask = ma.getmaskarray(data) if mask.any(): data, fill_value = maybe_upcast(data, copy=True) + data.soften_mask() # set hardmask False if it was True data[mask] = fill_value else: data = data.copy() diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index b3c893c7d84be..906d2861924bd 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -547,6 +547,7 @@ def sanitize_array(data, index, dtype=None, copy=False, mask = ma.getmaskarray(data) if mask.any(): data, fill_value = maybe_upcast(data, copy=True) + data.soften_mask() # set hardmask False if it was True data[mask] = fill_value else: data = data.copy() diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 8a5ec1a16d1df..0e795e99629e7 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -757,6 +757,20 @@ def test_constructor_maskedarray_nonfloat(self): assert frame['A'][1] is True assert frame['C'][2] is False + # Check hardened masks + mat_hard = ma.masked_all((2, 3), dtype=float).harden_mask() + frame = DataFrame(mat_hard, columns=['A', 'B', 'C'], index=[1, 2]) + assert len(frame.index) == 2 + assert len(frame.columns) == 3 + assert np.all(~np.asarray(frame == frame)) + # Check case where mask is hard but no data 
are masked + mat_hard = ma.ones((2,3), dtype=float).harden_mask() + frame = DataFrame(mat_hard, columns=['A', 'B', 'C'], index=[1, 2]) + assert len(frame.index) == 2 + assert len(frame.columns) == 3 + assert np.all(np.asarray(frame == 1.0)) + + def test_constructor_mrecarray(self): # Ensure mrecarray produces frame identical to dict of masked arrays # from GH3479 diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index f5a445e2cca9a..5bd10fa7989ed 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -386,6 +386,11 @@ def test_constructor_maskedarray(self): expected = Series([nan, nan, nan]) assert_series_equal(result, expected) + data_hard = ma.copy(data).harden_mask() + result = Series(data_hard) + expected = Series([nan, nan, nan]) + assert_series_equal(result, expected) + data[0] = 0.0 data[2] = 2.0 index = ['a', 'b', 'c'] From 3ff25bdfcc63812afb0288afb12c3720e7b3cdce Mon Sep 17 00:00:00 2001 From: karldw Date: Thu, 3 Jan 2019 07:45:21 -0800 Subject: [PATCH 2/3] Fix issues --- doc/source/whatsnew/v0.24.0.rst | 1 + pandas/core/frame.py | 2 +- pandas/core/internals/construction.py | 2 +- pandas/tests/frame/test_constructors.py | 34 +++++++++++++++--------- pandas/tests/series/test_constructors.py | 12 +++++---- 5 files changed, 32 insertions(+), 19 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 7628c53cefa06..c9210a5597d48 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1537,6 +1537,7 @@ Missing - Bug in :func:`Series.hasnans` that could be incorrectly cached and return incorrect answers if null elements are introduced after an initial call (:issue:`19700`) - :func:`Series.isin` now treats all NaN-floats as equal also for ``np.object``-dtype. 
This behavior is consistent with the behavior for float64 (:issue:`22119`) - :func:`unique` no longer mangles NaN-floats and the ``NaT``-object for ``np.object``-dtype, i.e. ``NaT`` is no longer coerced to a NaN-value and is treated as a different entity. (:issue:`22295`) +- :class:`DataFrame` and :class:`Series` now properly handle numpy masked arrays with hardened masks. Previously, constructing a DataFrame or Series from a masked array with a hard mask would create a pandas object containing the underlying value, rather than the expected NaN. (:issue:`24574`) MultiIndex diff --git a/pandas/core/frame.py b/pandas/core/frame.py index dda9fffacd91c..76d3d704497b4 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -400,7 +400,7 @@ def __init__(self, data=None, index=None, columns=None, dtype=None, mask = ma.getmaskarray(data) if mask.any(): data, fill_value = maybe_upcast(data, copy=True) - data.soften_mask() # set hardmask False if it was True + data.soften_mask() # set hardmask False if it was True data[mask] = fill_value else: data = data.copy() diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 906d2861924bd..446ad72ac4a53 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -547,7 +547,7 @@ def sanitize_array(data, index, dtype=None, copy=False, mask = ma.getmaskarray(data) if mask.any(): data, fill_value = maybe_upcast(data, copy=True) - data.soften_mask() # set hardmask False if it was True + data.soften_mask() # set hardmask False if it was True data[mask] = fill_value else: data = data.copy() diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 0e795e99629e7..42044e700a45e 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -757,19 +757,29 @@ def test_constructor_maskedarray_nonfloat(self): assert frame['A'][1] is True assert frame['C'][2] is False - # Check
hardened masks - mat_hard = ma.masked_all((2, 3), dtype=float).harden_mask() - frame = DataFrame(mat_hard, columns=['A', 'B', 'C'], index=[1, 2]) - assert len(frame.index) == 2 - assert len(frame.columns) == 3 - assert np.all(~np.asarray(frame == frame)) + def test_constructor_maskedarray_hardened(self): + # Check numpy masked arrays with hard masks -- from GH24574 + mat_hard = ma.masked_all((2, 2), dtype=float).harden_mask() + result = pd.DataFrame(mat_hard, columns=['A', 'B'], index=[1, 2]) + expected = pd.DataFrame({ + 'A': [np.nan, np.nan], + 'B': [np.nan, np.nan], + }, + columns=['A', 'B'], + index=[1, 2], + dtype=float) + tm.assert_frame_equal(result, expected) # Check case where mask is hard but no data are masked - mat_hard = ma.ones((2,3), dtype=float).harden_mask() - frame = DataFrame(mat_hard, columns=['A', 'B', 'C'], index=[1, 2]) - assert len(frame.index) == 2 - assert len(frame.columns) == 3 - assert np.all(np.asarray(frame == 1.0)) - + mat_hard = ma.ones((2, 2), dtype=float).harden_mask() + result = pd.DataFrame(mat_hard, columns=['A', 'B'], index=[1, 2]) + expected = pd.DataFrame({ + 'A': [1.0, 1.0], + 'B': [1.0, 1.0], + }, + columns=['A', 'B'], + index=[1, 2], + dtype=float) + tm.assert_frame_equal(result, expected) def test_constructor_mrecarray(self): # Ensure mrecarray produces frame identical to dict of masked arrays diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 5bd10fa7989ed..667065d09758b 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -386,11 +386,6 @@ def test_constructor_maskedarray(self): expected = Series([nan, nan, nan]) assert_series_equal(result, expected) - data_hard = ma.copy(data).harden_mask() - result = Series(data_hard) - expected = Series([nan, nan, nan]) - assert_series_equal(result, expected) - data[0] = 0.0 data[2] = 2.0 index = ['a', 'b', 'c'] @@ -456,6 +451,13 @@ def test_constructor_maskedarray(self): 
datetime(2001, 1, 3)], index=index, dtype='M8[ns]') assert_series_equal(result, expected) + def test_constructor_maskedarray_hardened(self): + # Check numpy masked arrays with hard masks -- from GH24574 + data = ma.masked_all((3, ), dtype=float).harden_mask() + result = pd.Series(data) + expected = pd.Series([nan, nan, nan]) + tm.assert_series_equal(result, expected) + def test_series_ctor_plus_datetimeindex(self): rng = date_range('20090415', '20090519', freq='B') data = {k: 1 for k in rng} From ad01a21827739bb2b0c26d5e59dc806deba57a6a Mon Sep 17 00:00:00 2001 From: karldw Date: Thu, 3 Jan 2019 15:30:39 -0800 Subject: [PATCH 3/3] Fix flake8 issues --- pandas/tests/frame/test_constructors.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 42044e700a45e..c8b3f23db1492 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -763,8 +763,7 @@ def test_constructor_maskedarray_hardened(self): result = pd.DataFrame(mat_hard, columns=['A', 'B'], index=[1, 2]) expected = pd.DataFrame({ 'A': [np.nan, np.nan], - 'B': [np.nan, np.nan], - }, + 'B': [np.nan, np.nan]}, columns=['A', 'B'], index=[1, 2], dtype=float) @@ -774,8 +773,7 @@ def test_constructor_maskedarray_hardened(self): result = pd.DataFrame(mat_hard, columns=['A', 'B'], index=[1, 2]) expected = pd.DataFrame({ 'A': [1.0, 1.0], - 'B': [1.0, 1.0], - }, + 'B': [1.0, 1.0]}, columns=['A', 'B'], index=[1, 2], dtype=float)