Skip to content

Commit 644fab6

Browse files
committed
Coerce None consistently, depending on the dtype of the container.
Fixes pandas-dev#7939.
1 parent 3e7292c commit 644fab6

File tree

6 files changed

+224
-7
lines changed

6 files changed

+224
-7
lines changed

doc/source/missing_data.rst

+28
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,34 @@ pandas objects provide intercompatibility between ``NaT`` and ``NaN``.
105105
df2
106106
df2.get_dtype_counts()
107107
108+
.. _missing.inserting:
109+
110+
Inserting missing data
111+
----------------------
112+
113+
You can insert missing values by simply assigning to containers. The
114+
actual missing value used will be chosen based on the dtype.
115+
116+
For example, numeric containers will always use ``NaN`` regardless of
117+
the missing value type chosen:
118+
119+
.. ipython:: python
120+
121+
s = Series([1, 2, 3])
122+
s.loc[0] = None
123+
s
124+
125+
Likewise, datetime containers will always use ``NaT``.
126+
127+
For object containers, pandas will use the value given:
128+
129+
.. ipython:: python
130+
131+
s = Series(["a", "b", "c"])
132+
s.loc[0] = None
133+
s.loc[1] = np.nan
134+
s
135+
108136
109137
Calculations with missing data
110138
------------------------------

doc/source/v0.15.0.txt

+25
Original file line numberDiff line numberDiff line change
@@ -232,6 +232,31 @@ API changes
232232
idx.duplicated()
233233
idx.drop_duplicates()
234234

235+
- Assigning ``None`` to values in a container now considers the container's dtype when choosing an 'empty' value (:issue:`7941`).
236+
237+
Previously, assigning ``None`` in numeric containers changed the
238+
dtype to object (or errored, depending on the call). It now uses
239+
NaN:
240+
241+
.. ipython:: python
242+
243+
s = Series([1, 2, 3])
244+
s.loc[0] = None
245+
s
246+
247+
``NaT`` is now used similarly for datetime containers.
248+
249+
For object containers, we now preserve None values (previously these
250+
were converted to NaN values).
251+
252+
.. ipython:: python
253+
254+
s = Series(["a", "b", "c"])
255+
s.loc[0] = None
256+
s
257+
258+
To insert a NaN, you must explicitly use ``np.nan``. See the :ref:`docs <missing.inserting>`.
259+
235260
.. _whatsnew_0150.dt:
236261

237262
.dt accessor

pandas/core/common.py

+26-3
Original file line numberDiff line numberDiff line change
@@ -368,7 +368,7 @@ def _is_null_datelike_scalar(other):
368368
return isnull(other)
369369
return False
370370

371-
def array_equivalent(left, right):
371+
def array_equivalent(left, right, strict_nan=False):
372372
"""
373373
True if two arrays, left and right, have equal non-NaN elements, and NaNs in
374374
corresponding locations. False otherwise. It is assumed that left and right
@@ -379,6 +379,8 @@ def array_equivalent(left, right):
379379
Parameters
380380
----------
381381
left, right : ndarrays
382+
strict_nan : bool, default False
383+
If True, consider NaN and None to be different.
382384
383385
Returns
384386
-------
@@ -394,11 +396,32 @@ def array_equivalent(left, right):
394396
"""
395397
left, right = np.asarray(left), np.asarray(right)
396398
if left.shape != right.shape: return False
397-
# NaNs occur only in object arrays, float or complex arrays.
399+
400+
# Object arrays can contain None, NaN and NaT.
398401
if issubclass(left.dtype.type, np.object_):
399-
return ((left == right) | (pd.isnull(left) & pd.isnull(right))).all()
402+
403+
if not strict_nan:
404+
# pd.isnull considers NaN and None to be equivalent.
405+
return ((left == right) | (pd.isnull(left) & pd.isnull(right))).all()
406+
407+
for left_value, right_value in zip(left, right):
408+
if left_value is tslib.NaT and right_value is not tslib.NaT:
409+
return False
410+
411+
elif isinstance(left_value, float) and np.isnan(left_value):
412+
if not isinstance(right_value, float) or not np.isnan(right_value):
413+
return False
414+
else:
415+
if left_value != right_value:
416+
return False
417+
418+
return True
419+
420+
# NaNs can occur in float and complex arrays.
400421
if issubclass(left.dtype.type, (np.floating, np.complexfloating)):
401422
return ((left == right) | (np.isnan(left) & np.isnan(right))).all()
423+
424+
# NaNs cannot occur otherwise.
402425
return np.array_equal(left, right)
403426

404427
def _iterable_not_string(x):

pandas/core/internals.py

+6-1
Original file line numberDiff line numberDiff line change
@@ -494,6 +494,11 @@ def setitem(self, indexer, value):
494494
compatible shape
495495
"""
496496

497+
# coerce None values, if appropriate
498+
if value is None:
499+
if self.is_numeric:
500+
value = np.nan
501+
497502
# coerce args
498503
values, value = self._try_coerce_args(self.values, value)
499504
arr_value = np.array(value)
@@ -587,7 +592,7 @@ def putmask(self, mask, new, align=True, inplace=False):
587592
mask = mask.values.T
588593

589594
# if we are passed a scalar None, convert it here
590-
if not is_list_like(new) and isnull(new):
595+
if not is_list_like(new) and isnull(new) and not self.is_object:
591596
new = self.fill_value
592597

593598
if self._can_hold_element(new):

pandas/tests/test_indexing.py

+137-1
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,10 @@
22
import nose
33
import itertools
44
import warnings
5+
from datetime import datetime
56

67
from pandas.compat import range, lrange, lzip, StringIO, lmap, map
8+
from pandas.tslib import NaT
79
from numpy import nan
810
from numpy.random import randn
911
import numpy as np
@@ -14,7 +16,8 @@
1416
from pandas.core.api import (DataFrame, Index, Series, Panel, isnull,
1517
MultiIndex, Float64Index, Timestamp)
1618
from pandas.util.testing import (assert_almost_equal, assert_series_equal,
17-
assert_frame_equal, assert_panel_equal)
19+
assert_frame_equal, assert_panel_equal,
20+
assert_attr_equal)
1821
from pandas import concat
1922

2023
import pandas.util.testing as tm
@@ -3816,6 +3819,139 @@ def test_float_index_non_scalar_assignment(self):
38163819
tm.assert_frame_equal(df,df2)
38173820

38183821

3822+
class TestSeriesNoneCoercion(tm.TestCase):
3823+
EXPECTED_RESULTS = [
3824+
# For numeric series, we should coerce to NaN.
3825+
([1, 2, 3], [np.nan, 2, 3]),
3826+
([1.0, 2.0, 3.0], [np.nan, 2.0, 3.0]),
3827+
3828+
# For datetime series, we should coerce to NaT.
3829+
([datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
3830+
[NaT, datetime(2000, 1, 2), datetime(2000, 1, 3)]),
3831+
3832+
# For objects, we should preserve the None value.
3833+
(["foo", "bar", "baz"], [None, "bar", "baz"]),
3834+
]
3835+
3836+
def test_coercion_with_setitem(self):
3837+
for start_data, expected_result in self.EXPECTED_RESULTS:
3838+
start_series = Series(start_data)
3839+
start_series[0] = None
3840+
3841+
expected_series = Series(expected_result)
3842+
3843+
assert_attr_equal('dtype', start_series, expected_series)
3844+
self.assert_numpy_array_equivalent(
3845+
start_series.values,
3846+
expected_series.values, strict_nan=True)
3847+
3848+
def test_coercion_with_loc_setitem(self):
3849+
for start_data, expected_result in self.EXPECTED_RESULTS:
3850+
start_series = Series(start_data)
3851+
start_series.loc[0] = None
3852+
3853+
expected_series = Series(expected_result)
3854+
3855+
assert_attr_equal('dtype', start_series, expected_series)
3856+
self.assert_numpy_array_equivalent(
3857+
start_series.values,
3858+
expected_series.values, strict_nan=True)
3859+
3860+
def test_coercion_with_setitem_and_series(self):
3861+
for start_data, expected_result in self.EXPECTED_RESULTS:
3862+
start_series = Series(start_data)
3863+
start_series[start_series == start_series[0]] = None
3864+
3865+
expected_series = Series(expected_result)
3866+
3867+
assert_attr_equal('dtype', start_series, expected_series)
3868+
self.assert_numpy_array_equivalent(
3869+
start_series.values,
3870+
expected_series.values, strict_nan=True)
3871+
3872+
def test_coercion_with_loc_and_series(self):
3873+
for start_data, expected_result in self.EXPECTED_RESULTS:
3874+
start_series = Series(start_data)
3875+
start_series.loc[start_series == start_series[0]] = None
3876+
3877+
expected_series = Series(expected_result)
3878+
3879+
assert_attr_equal('dtype', start_series, expected_series)
3880+
self.assert_numpy_array_equivalent(
3881+
start_series.values,
3882+
expected_series.values, strict_nan=True)
3883+
3884+
3885+
class TestDataframeNoneCoercion(tm.TestCase):
3886+
EXPECTED_SINGLE_ROW_RESULTS = [
3887+
# For numeric series, we should coerce to NaN.
3888+
([1, 2, 3], [np.nan, 2, 3]),
3889+
([1.0, 2.0, 3.0], [np.nan, 2.0, 3.0]),
3890+
3891+
# For datetime series, we should coerce to NaT.
3892+
([datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
3893+
[NaT, datetime(2000, 1, 2), datetime(2000, 1, 3)]),
3894+
3895+
# For objects, we should preserve the None value.
3896+
(["foo", "bar", "baz"], [None, "bar", "baz"]),
3897+
]
3898+
3899+
def test_coercion_with_loc(self):
3900+
for start_data, expected_result, in self.EXPECTED_SINGLE_ROW_RESULTS:
3901+
start_dataframe = DataFrame({'foo': start_data})
3902+
start_dataframe.loc[0, ['foo']] = None
3903+
3904+
expected_dataframe = DataFrame({'foo': expected_result})
3905+
3906+
assert_attr_equal('dtype', start_dataframe['foo'], expected_dataframe['foo'])
3907+
self.assert_numpy_array_equivalent(
3908+
start_dataframe['foo'].values,
3909+
expected_dataframe['foo'].values, strict_nan=True)
3910+
3911+
def test_coercion_with_setitem_and_dataframe(self):
3912+
for start_data, expected_result, in self.EXPECTED_SINGLE_ROW_RESULTS:
3913+
start_dataframe = DataFrame({'foo': start_data})
3914+
start_dataframe[start_dataframe['foo'] == start_dataframe['foo'][0]] = None
3915+
3916+
expected_dataframe = DataFrame({'foo': expected_result})
3917+
3918+
assert_attr_equal('dtype', start_dataframe['foo'], expected_dataframe['foo'])
3919+
self.assert_numpy_array_equivalent(
3920+
start_dataframe['foo'].values,
3921+
expected_dataframe['foo'].values, strict_nan=True)
3922+
3923+
def test_none_coercion_loc_and_dataframe(self):
3924+
for start_data, expected_result, in self.EXPECTED_SINGLE_ROW_RESULTS:
3925+
start_dataframe = DataFrame({'foo': start_data})
3926+
start_dataframe.loc[start_dataframe['foo'] == start_dataframe['foo'][0]] = None
3927+
3928+
expected_dataframe = DataFrame({'foo': expected_result})
3929+
3930+
assert_attr_equal('dtype', start_dataframe['foo'], expected_dataframe['foo'])
3931+
self.assert_numpy_array_equivalent(
3932+
start_dataframe['foo'].values,
3933+
expected_dataframe['foo'].values, strict_nan=True)
3934+
3935+
def test_none_coercion_mixed_dtypes(self):
3936+
start_dataframe = DataFrame({
3937+
'a': [1, 2, 3],
3938+
'b': [1.0, 2.0, 3.0],
3939+
'c': [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
3940+
'd': ['a', 'b', 'c']})
3941+
start_dataframe.iloc[0] = None
3942+
3943+
expected_dataframe = DataFrame({
3944+
'a': [np.nan, 2, 3],
3945+
'b': [np.nan, 2.0, 3.0],
3946+
'c': [NaT, datetime(2000, 1, 2), datetime(2000, 1, 3)],
3947+
'd': [None, 'b', 'c']})
3948+
3949+
for column in expected_dataframe.columns:
3950+
assert_attr_equal('dtype', start_dataframe[column], expected_dataframe[column])
3951+
self.assert_numpy_array_equivalent(
3952+
start_dataframe[column].values,
3953+
expected_dataframe[column].values, strict_nan=True)
3954+
38193955

38203956
if __name__ == '__main__':
38213957
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],

pandas/util/testing.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@ def round_trip_pickle(self, obj, path=None):
105105
pd.to_pickle(obj, path)
106106
return pd.read_pickle(path)
107107

108-
def assert_numpy_array_equivalent(self, np_array, assert_equal):
108+
def assert_numpy_array_equivalent(self, np_array, assert_equal, strict_nan=False):
109109
"""Checks that 'np_array' is equivalent to 'assert_equal'
110110
111111
Two numpy arrays are equivalent if the arrays have equal non-NaN elements, and
@@ -115,7 +115,7 @@ def assert_numpy_array_equivalent(self, np_array, assert_equal):
115115
similar to `assert_numpy_array_equal()`. If the expected array includes `np.nan` use this
116116
function.
117117
"""
118-
if array_equivalent(np_array, assert_equal):
118+
if array_equivalent(np_array, assert_equal, strict_nan=strict_nan):
119119
return
120120
raise AssertionError('{0} is not equivalent to {1}.'.format(np_array, assert_equal))
121121

0 commit comments

Comments
 (0)