Skip to content

Commit 1d5cb4a

Browse files
committed
Merge pull request #7941 from ahlmss/set_item_to_none
API: Coerce None according to the dtype of the container
2 parents 12a39bb + 644fab6 commit 1d5cb4a

File tree

6 files changed

+224
-7
lines changed

6 files changed

+224
-7
lines changed

doc/source/missing_data.rst

+28
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,34 @@ pandas objects provide intercompatibility between ``NaT`` and ``NaN``.
105105
df2
106106
df2.get_dtype_counts()
107107
108+
.. _missing.inserting:
109+
110+
Inserting missing data
111+
----------------------
112+
113+
You can insert missing values by simply assigning to containers. The
114+
actual missing value used will be chosen based on the dtype.
115+
116+
For example, numeric containers will always use ``NaN`` regardless of
117+
the missing value type chosen:
118+
119+
.. ipython:: python
120+
121+
s = Series([1, 2, 3])
122+
s.loc[0] = None
123+
s
124+
125+
Likewise, datetime containers will always use ``NaT``.
126+
127+
For object containers, pandas will use the value given:
128+
129+
.. ipython:: python
130+
131+
s = Series(["a", "b", "c"])
132+
s.loc[0] = None
133+
s.loc[1] = np.nan
134+
s
135+
108136
109137
Calculations with missing data
110138
------------------------------

doc/source/v0.15.0.txt

+25
Original file line numberDiff line numberDiff line change
@@ -270,6 +270,31 @@ API changes
270270
idx.duplicated()
271271
idx.drop_duplicates()
272272

273+
- Assigning ``None`` to elements of a container now considers the container's dtype when choosing an 'empty' value (:issue:`7941`).
274+
275+
Previously, assigning ``None`` in numeric containers changed the
276+
dtype to object (or errored, depending on the call). It now uses
277+
``NaN``:
278+
279+
.. ipython:: python
280+
281+
s = Series([1, 2, 3])
282+
s.loc[0] = None
283+
s
284+
285+
``NaT`` is now used similarly for datetime containers.
286+
287+
For object containers, we now preserve None values (previously these
288+
were converted to NaN values).
289+
290+
.. ipython:: python
291+
292+
s = Series(["a", "b", "c"])
293+
s.loc[0] = None
294+
s
295+
296+
To insert a NaN, you must explicitly use ``np.nan``. See the :ref:`docs <missing.inserting>`.
297+
273298
.. _whatsnew_0150.dt:
274299

275300
.dt accessor

pandas/core/common.py

+26-3
Original file line numberDiff line numberDiff line change
@@ -374,7 +374,7 @@ def _is_null_datelike_scalar(other):
374374
return isnull(other)
375375
return False
376376

377-
def array_equivalent(left, right):
377+
def array_equivalent(left, right, strict_nan=False):
378378
"""
379379
True if two arrays, left and right, have equal non-NaN elements, and NaNs in
380380
corresponding locations. False otherwise. It is assumed that left and right
@@ -385,6 +385,8 @@ def array_equivalent(left, right):
385385
Parameters
386386
----------
387387
left, right : ndarrays
388+
strict_nan : bool, default False
389+
If True, consider NaN and None to be different.
388390
389391
Returns
390392
-------
@@ -400,11 +402,32 @@ def array_equivalent(left, right):
400402
"""
401403
left, right = np.asarray(left), np.asarray(right)
402404
if left.shape != right.shape: return False
403-
# NaNs occur only in object arrays, float or complex arrays.
405+
406+
# Object arrays can contain None, NaN and NaT.
404407
if issubclass(left.dtype.type, np.object_):
405-
return ((left == right) | (pd.isnull(left) & pd.isnull(right))).all()
408+
409+
if not strict_nan:
410+
# pd.isnull considers NaN and None to be equivalent.
411+
return ((left == right) | (pd.isnull(left) & pd.isnull(right))).all()
412+
413+
for left_value, right_value in zip(left, right):
414+
if left_value is tslib.NaT and right_value is not tslib.NaT:
415+
return False
416+
417+
elif isinstance(left_value, float) and np.isnan(left_value):
418+
if not isinstance(right_value, float) or not np.isnan(right_value):
419+
return False
420+
else:
421+
if left_value != right_value:
422+
return False
423+
424+
return True
425+
426+
# NaNs can occur in float and complex arrays.
406427
if issubclass(left.dtype.type, (np.floating, np.complexfloating)):
407428
return ((left == right) | (np.isnan(left) & np.isnan(right))).all()
429+
430+
# NaNs cannot occur otherwise.
408431
return np.array_equal(left, right)
409432

410433
def _iterable_not_string(x):

pandas/core/internals.py

+6-1
Original file line numberDiff line numberDiff line change
@@ -494,6 +494,11 @@ def setitem(self, indexer, value):
494494
compatible shape
495495
"""
496496

497+
# coerce None values, if appropriate
498+
if value is None:
499+
if self.is_numeric:
500+
value = np.nan
501+
497502
# coerce args
498503
values, value = self._try_coerce_args(self.values, value)
499504
arr_value = np.array(value)
@@ -587,7 +592,7 @@ def putmask(self, mask, new, align=True, inplace=False):
587592
mask = mask.values.T
588593

589594
# if we are passed a scalar None, convert it here
590-
if not is_list_like(new) and isnull(new):
595+
if not is_list_like(new) and isnull(new) and not self.is_object:
591596
new = self.fill_value
592597

593598
if self._can_hold_element(new):

pandas/tests/test_indexing.py

+137-1
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,10 @@
22
import nose
33
import itertools
44
import warnings
5+
from datetime import datetime
56

67
from pandas.compat import range, lrange, lzip, StringIO, lmap, map
8+
from pandas.tslib import NaT
79
from numpy import nan
810
from numpy.random import randn
911
import numpy as np
@@ -14,7 +16,8 @@
1416
from pandas.core.api import (DataFrame, Index, Series, Panel, isnull,
1517
MultiIndex, Float64Index, Timestamp)
1618
from pandas.util.testing import (assert_almost_equal, assert_series_equal,
17-
assert_frame_equal, assert_panel_equal)
19+
assert_frame_equal, assert_panel_equal,
20+
assert_attr_equal)
1821
from pandas import concat
1922

2023
import pandas.util.testing as tm
@@ -3816,6 +3819,139 @@ def test_float_index_non_scalar_assignment(self):
38163819
tm.assert_frame_equal(df,df2)
38173820

38183821

3822+
class TestSeriesNoneCoercion(tm.TestCase):
3823+
EXPECTED_RESULTS = [
3824+
# For numeric series, we should coerce to NaN.
3825+
([1, 2, 3], [np.nan, 2, 3]),
3826+
([1.0, 2.0, 3.0], [np.nan, 2.0, 3.0]),
3827+
3828+
# For datetime series, we should coerce to NaT.
3829+
([datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
3830+
[NaT, datetime(2000, 1, 2), datetime(2000, 1, 3)]),
3831+
3832+
# For objects, we should preserve the None value.
3833+
(["foo", "bar", "baz"], [None, "bar", "baz"]),
3834+
]
3835+
3836+
def test_coercion_with_setitem(self):
3837+
for start_data, expected_result in self.EXPECTED_RESULTS:
3838+
start_series = Series(start_data)
3839+
start_series[0] = None
3840+
3841+
expected_series = Series(expected_result)
3842+
3843+
assert_attr_equal('dtype', start_series, expected_series)
3844+
self.assert_numpy_array_equivalent(
3845+
start_series.values,
3846+
expected_series.values, strict_nan=True)
3847+
3848+
def test_coercion_with_loc_setitem(self):
3849+
for start_data, expected_result in self.EXPECTED_RESULTS:
3850+
start_series = Series(start_data)
3851+
start_series.loc[0] = None
3852+
3853+
expected_series = Series(expected_result)
3854+
3855+
assert_attr_equal('dtype', start_series, expected_series)
3856+
self.assert_numpy_array_equivalent(
3857+
start_series.values,
3858+
expected_series.values, strict_nan=True)
3859+
3860+
def test_coercion_with_setitem_and_series(self):
3861+
for start_data, expected_result in self.EXPECTED_RESULTS:
3862+
start_series = Series(start_data)
3863+
start_series[start_series == start_series[0]] = None
3864+
3865+
expected_series = Series(expected_result)
3866+
3867+
assert_attr_equal('dtype', start_series, expected_series)
3868+
self.assert_numpy_array_equivalent(
3869+
start_series.values,
3870+
expected_series.values, strict_nan=True)
3871+
3872+
def test_coercion_with_loc_and_series(self):
3873+
for start_data, expected_result in self.EXPECTED_RESULTS:
3874+
start_series = Series(start_data)
3875+
start_series.loc[start_series == start_series[0]] = None
3876+
3877+
expected_series = Series(expected_result)
3878+
3879+
assert_attr_equal('dtype', start_series, expected_series)
3880+
self.assert_numpy_array_equivalent(
3881+
start_series.values,
3882+
expected_series.values, strict_nan=True)
3883+
3884+
3885+
class TestDataframeNoneCoercion(tm.TestCase):
3886+
EXPECTED_SINGLE_ROW_RESULTS = [
3887+
# For numeric series, we should coerce to NaN.
3888+
([1, 2, 3], [np.nan, 2, 3]),
3889+
([1.0, 2.0, 3.0], [np.nan, 2.0, 3.0]),
3890+
3891+
# For datetime series, we should coerce to NaT.
3892+
([datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
3893+
[NaT, datetime(2000, 1, 2), datetime(2000, 1, 3)]),
3894+
3895+
# For objects, we should preserve the None value.
3896+
(["foo", "bar", "baz"], [None, "bar", "baz"]),
3897+
]
3898+
3899+
def test_coercion_with_loc(self):
3900+
for start_data, expected_result, in self.EXPECTED_SINGLE_ROW_RESULTS:
3901+
start_dataframe = DataFrame({'foo': start_data})
3902+
start_dataframe.loc[0, ['foo']] = None
3903+
3904+
expected_dataframe = DataFrame({'foo': expected_result})
3905+
3906+
assert_attr_equal('dtype', start_dataframe['foo'], expected_dataframe['foo'])
3907+
self.assert_numpy_array_equivalent(
3908+
start_dataframe['foo'].values,
3909+
expected_dataframe['foo'].values, strict_nan=True)
3910+
3911+
def test_coercion_with_setitem_and_dataframe(self):
3912+
for start_data, expected_result, in self.EXPECTED_SINGLE_ROW_RESULTS:
3913+
start_dataframe = DataFrame({'foo': start_data})
3914+
start_dataframe[start_dataframe['foo'] == start_dataframe['foo'][0]] = None
3915+
3916+
expected_dataframe = DataFrame({'foo': expected_result})
3917+
3918+
assert_attr_equal('dtype', start_dataframe['foo'], expected_dataframe['foo'])
3919+
self.assert_numpy_array_equivalent(
3920+
start_dataframe['foo'].values,
3921+
expected_dataframe['foo'].values, strict_nan=True)
3922+
3923+
def test_none_coercion_loc_and_dataframe(self):
3924+
for start_data, expected_result, in self.EXPECTED_SINGLE_ROW_RESULTS:
3925+
start_dataframe = DataFrame({'foo': start_data})
3926+
start_dataframe.loc[start_dataframe['foo'] == start_dataframe['foo'][0]] = None
3927+
3928+
expected_dataframe = DataFrame({'foo': expected_result})
3929+
3930+
assert_attr_equal('dtype', start_dataframe['foo'], expected_dataframe['foo'])
3931+
self.assert_numpy_array_equivalent(
3932+
start_dataframe['foo'].values,
3933+
expected_dataframe['foo'].values, strict_nan=True)
3934+
3935+
def test_none_coercion_mixed_dtypes(self):
3936+
start_dataframe = DataFrame({
3937+
'a': [1, 2, 3],
3938+
'b': [1.0, 2.0, 3.0],
3939+
'c': [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
3940+
'd': ['a', 'b', 'c']})
3941+
start_dataframe.iloc[0] = None
3942+
3943+
expected_dataframe = DataFrame({
3944+
'a': [np.nan, 2, 3],
3945+
'b': [np.nan, 2.0, 3.0],
3946+
'c': [NaT, datetime(2000, 1, 2), datetime(2000, 1, 3)],
3947+
'd': [None, 'b', 'c']})
3948+
3949+
for column in expected_dataframe.columns:
3950+
assert_attr_equal('dtype', start_dataframe[column], expected_dataframe[column])
3951+
self.assert_numpy_array_equivalent(
3952+
start_dataframe[column].values,
3953+
expected_dataframe[column].values, strict_nan=True)
3954+
38193955

38203956
if __name__ == '__main__':
38213957
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],

pandas/util/testing.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@ def round_trip_pickle(self, obj, path=None):
105105
pd.to_pickle(obj, path)
106106
return pd.read_pickle(path)
107107

108-
def assert_numpy_array_equivalent(self, np_array, assert_equal):
108+
def assert_numpy_array_equivalent(self, np_array, assert_equal, strict_nan=False):
109109
"""Checks that 'np_array' is equivalent to 'assert_equal'
110110
111111
Two numpy arrays are equivalent if the arrays have equal non-NaN elements, and
@@ -115,7 +115,7 @@ def assert_numpy_array_equivalent(self, np_array, assert_equal):
115115
similar to `assert_numpy_array_equal()`. If the expected array includes `np.nan` use this
116116
function.
117117
"""
118-
if array_equivalent(np_array, assert_equal):
118+
if array_equivalent(np_array, assert_equal, strict_nan=strict_nan):
119119
return
120120
raise AssertionError('{0} is not equivalent to {1}.'.format(np_array, assert_equal))
121121

0 commit comments

Comments
 (0)