diff --git a/doc/source/missing_data.rst b/doc/source/missing_data.rst
index d3024daaa59c9..69afd861df325 100644
--- a/doc/source/missing_data.rst
+++ b/doc/source/missing_data.rst
@@ -105,6 +105,34 @@ pandas objects provide intercompatibility between ``NaT`` and ``NaN``.
    df2
    df2.get_dtype_counts()
 
+.. _missing.inserting:
+
+Inserting missing data
+----------------------
+
+You can insert missing values by simply assigning to containers. The
+actual missing value used will be chosen based on the dtype.
+
+For example, numeric containers will always use ``NaN`` regardless of
+the missing value type chosen:
+
+.. ipython:: python
+
+   s = Series([1, 2, 3])
+   s.loc[0] = None
+   s
+
+Likewise, datetime containers will always use ``NaT``.
+
+For object containers, pandas will use the value given:
+
+.. ipython:: python
+
+   s = Series(["a", "b", "c"])
+   s.loc[0] = None
+   s.loc[1] = np.nan
+   s
+
 Calculations with missing data
 ------------------------------
 
diff --git a/doc/source/v0.15.0.txt b/doc/source/v0.15.0.txt
index 4bd55b2172013..a0371f84a5649 100644
--- a/doc/source/v0.15.0.txt
+++ b/doc/source/v0.15.0.txt
@@ -232,6 +232,31 @@ API changes
     idx.duplicated()
     idx.drop_duplicates()
 
+- Assigning ``None`` to a Series or DataFrame now considers the dtype when choosing an 'empty' value (:issue:`7941`).
+
+  Previously, assigning ``None`` to a numeric container changed the
+  dtype to object (or raised an error, depending on the call). It now
+  uses ``NaN``:
+
+  .. ipython:: python
+
+     s = Series([1, 2, 3])
+     s.loc[0] = None
+     s
+
+  ``NaT`` is now used similarly for datetime containers.
+
+  For object containers, we now preserve None values (previously these
+  were converted to NaN values).
+
+  .. ipython:: python
+
+     s = Series(["a", "b", "c"])
+     s.loc[0] = None
+     s
+
+  To insert a NaN, you must explicitly use ``np.nan``. See the :ref:`docs <missing.inserting>`.
+
 .. _whatsnew_0150.dt:
 
 .dt accessor
diff --git a/pandas/core/common.py b/pandas/core/common.py
index 48fb75f59ac34..36f89a81836ae 100644
--- a/pandas/core/common.py
+++ b/pandas/core/common.py
@@ -368,7 +368,7 @@ def _is_null_datelike_scalar(other):
         return isnull(other)
     return False
 
-def array_equivalent(left, right):
+def array_equivalent(left, right, strict_nan=False):
     """
     True if two arrays, left and right, have equal non-NaN elements, and NaNs in
     corresponding locations. False otherwise. It is assumed that left and right
@@ -379,6 +379,8 @@ def array_equivalent(left, right):
     Parameters
     ----------
     left, right : ndarrays
+    strict_nan : bool, default False
+        If True, consider NaN and None to be different.
 
     Returns
     -------
@@ -394,11 +396,32 @@
     """
     left, right = np.asarray(left), np.asarray(right)
     if left.shape != right.shape: return False
-    # NaNs occur only in object arrays, float or complex arrays.
+
+    # Object arrays can contain None, NaN and NaT.
     if issubclass(left.dtype.type, np.object_):
-        return ((left == right) | (pd.isnull(left) & pd.isnull(right))).all()
+
+        if not strict_nan:
+            # pd.isnull considers NaN and None to be equivalent.
+            return ((left == right) | (pd.isnull(left) & pd.isnull(right))).all()
+
+        for left_value, right_value in zip(left, right):
+            if left_value is tslib.NaT and right_value is not tslib.NaT:
+                return False
+
+            elif isinstance(left_value, float) and np.isnan(left_value):
+                if not isinstance(right_value, float) or not np.isnan(right_value):
+                    return False
+            else:
+                if left_value != right_value:
+                    return False
+
+        return True
+
+    # NaNs can occur in float and complex arrays.
     if issubclass(left.dtype.type, (np.floating, np.complexfloating)):
         return ((left == right) | (np.isnan(left) & np.isnan(right))).all()
+
+    # NaNs cannot occur otherwise.
     return np.array_equal(left, right)
 
 def _iterable_not_string(x):
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
index f3b8a54034d56..0d61475905e75 100644
--- a/pandas/core/internals.py
+++ b/pandas/core/internals.py
@@ -494,6 +494,11 @@ def setitem(self, indexer, value):
             compatible shape
         """
 
+        # coerce None values, if appropriate
+        if value is None:
+            if self.is_numeric:
+                value = np.nan
+
         # coerce args
         values, value = self._try_coerce_args(self.values, value)
         arr_value = np.array(value)
@@ -587,7 +592,7 @@ def putmask(self, mask, new, align=True, inplace=False):
             mask = mask.values.T
 
         # if we are passed a scalar None, convert it here
-        if not is_list_like(new) and isnull(new):
+        if not is_list_like(new) and isnull(new) and not self.is_object:
             new = self.fill_value
 
         if self._can_hold_element(new):
diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py
index e7bb716de60f3..967f437fc5ca1 100644
--- a/pandas/tests/test_indexing.py
+++ b/pandas/tests/test_indexing.py
@@ -2,8 +2,10 @@
 import nose
 import itertools
 import warnings
+from datetime import datetime
 
 from pandas.compat import range, lrange, lzip, StringIO, lmap, map
+from pandas.tslib import NaT
 from numpy import nan
 from numpy.random import randn
 import numpy as np
@@ -14,7 +16,8 @@
 from pandas.core.api import (DataFrame, Index, Series, Panel, isnull,
                              MultiIndex, Float64Index, Timestamp)
 from pandas.util.testing import (assert_almost_equal, assert_series_equal,
-                                 assert_frame_equal, assert_panel_equal)
+                                 assert_frame_equal, assert_panel_equal,
+                                 assert_attr_equal)
 from pandas import concat
 import pandas.util.testing as tm
@@ -3816,6 +3819,139 @@ def test_float_index_non_scalar_assignment(self):
         tm.assert_frame_equal(df,df2)
 
 
+class TestSeriesNoneCoercion(tm.TestCase):
+    EXPECTED_RESULTS = [
+        # For numeric series, we should coerce to NaN.
+        ([1, 2, 3], [np.nan, 2, 3]),
+        ([1.0, 2.0, 3.0], [np.nan, 2.0, 3.0]),
+
+        # For datetime series, we should coerce to NaT.
+        ([datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
+         [NaT, datetime(2000, 1, 2), datetime(2000, 1, 3)]),
+
+        # For objects, we should preserve the None value.
+        (["foo", "bar", "baz"], [None, "bar", "baz"]),
+    ]
+
+    def test_coercion_with_setitem(self):
+        for start_data, expected_result in self.EXPECTED_RESULTS:
+            start_series = Series(start_data)
+            start_series[0] = None
+
+            expected_series = Series(expected_result)
+
+            assert_attr_equal('dtype', start_series, expected_series)
+            self.assert_numpy_array_equivalent(
+                start_series.values,
+                expected_series.values, strict_nan=True)
+
+    def test_coercion_with_loc_setitem(self):
+        for start_data, expected_result in self.EXPECTED_RESULTS:
+            start_series = Series(start_data)
+            start_series.loc[0] = None
+
+            expected_series = Series(expected_result)
+
+            assert_attr_equal('dtype', start_series, expected_series)
+            self.assert_numpy_array_equivalent(
+                start_series.values,
+                expected_series.values, strict_nan=True)
+
+    def test_coercion_with_setitem_and_series(self):
+        for start_data, expected_result in self.EXPECTED_RESULTS:
+            start_series = Series(start_data)
+            start_series[start_series == start_series[0]] = None
+
+            expected_series = Series(expected_result)
+
+            assert_attr_equal('dtype', start_series, expected_series)
+            self.assert_numpy_array_equivalent(
+                start_series.values,
+                expected_series.values, strict_nan=True)
+
+    def test_coercion_with_loc_and_series(self):
+        for start_data, expected_result in self.EXPECTED_RESULTS:
+            start_series = Series(start_data)
+            start_series.loc[start_series == start_series[0]] = None
+
+            expected_series = Series(expected_result)
+
+            assert_attr_equal('dtype', start_series, expected_series)
+            self.assert_numpy_array_equivalent(
+                start_series.values,
+                expected_series.values, strict_nan=True)
+
+
+class TestDataframeNoneCoercion(tm.TestCase):
+    EXPECTED_SINGLE_ROW_RESULTS = [
+        # For numeric series, we should coerce to NaN.
+        ([1, 2, 3], [np.nan, 2, 3]),
+        ([1.0, 2.0, 3.0], [np.nan, 2.0, 3.0]),
+
+        # For datetime series, we should coerce to NaT.
+        ([datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
+         [NaT, datetime(2000, 1, 2), datetime(2000, 1, 3)]),
+
+        # For objects, we should preserve the None value.
+        (["foo", "bar", "baz"], [None, "bar", "baz"]),
+    ]
+
+    def test_coercion_with_loc(self):
+        for start_data, expected_result in self.EXPECTED_SINGLE_ROW_RESULTS:
+            start_dataframe = DataFrame({'foo': start_data})
+            start_dataframe.loc[0, ['foo']] = None
+
+            expected_dataframe = DataFrame({'foo': expected_result})
+
+            assert_attr_equal('dtype', start_dataframe['foo'], expected_dataframe['foo'])
+            self.assert_numpy_array_equivalent(
+                start_dataframe['foo'].values,
+                expected_dataframe['foo'].values, strict_nan=True)
+
+    def test_coercion_with_setitem_and_dataframe(self):
+        for start_data, expected_result in self.EXPECTED_SINGLE_ROW_RESULTS:
+            start_dataframe = DataFrame({'foo': start_data})
+            start_dataframe[start_dataframe['foo'] == start_dataframe['foo'][0]] = None
+
+            expected_dataframe = DataFrame({'foo': expected_result})
+
+            assert_attr_equal('dtype', start_dataframe['foo'], expected_dataframe['foo'])
+            self.assert_numpy_array_equivalent(
+                start_dataframe['foo'].values,
+                expected_dataframe['foo'].values, strict_nan=True)
+
+    def test_none_coercion_loc_and_dataframe(self):
+        for start_data, expected_result in self.EXPECTED_SINGLE_ROW_RESULTS:
+            start_dataframe = DataFrame({'foo': start_data})
+            start_dataframe.loc[start_dataframe['foo'] == start_dataframe['foo'][0]] = None
+
+            expected_dataframe = DataFrame({'foo': expected_result})
+
+            assert_attr_equal('dtype', start_dataframe['foo'], expected_dataframe['foo'])
+            self.assert_numpy_array_equivalent(
+                start_dataframe['foo'].values,
+                expected_dataframe['foo'].values, strict_nan=True)
+
+    def test_none_coercion_mixed_dtypes(self):
+        start_dataframe = DataFrame({
+            'a': [1, 2, 3],
+            'b': [1.0, 2.0, 3.0],
+            'c': [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
+            'd': ['a', 'b', 'c']})
+        start_dataframe.iloc[0] = None
+
+        expected_dataframe = DataFrame({
+            'a': [np.nan, 2, 3],
+            'b': [np.nan, 2.0, 3.0],
+            'c': [NaT, datetime(2000, 1, 2), datetime(2000, 1, 3)],
+            'd': [None, 'b', 'c']})
+
+        for column in expected_dataframe.columns:
+            assert_attr_equal('dtype', start_dataframe[column], expected_dataframe[column])
+            self.assert_numpy_array_equivalent(
+                start_dataframe[column].values,
+                expected_dataframe[column].values, strict_nan=True)
+
 if __name__ == '__main__':
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
diff --git a/pandas/util/testing.py b/pandas/util/testing.py
index a59994970009f..ef9d7d1566ec2 100644
--- a/pandas/util/testing.py
+++ b/pandas/util/testing.py
@@ -105,7 +105,7 @@ def round_trip_pickle(self, obj, path=None):
         pd.to_pickle(obj, path)
         return pd.read_pickle(path)
 
-    def assert_numpy_array_equivalent(self, np_array, assert_equal):
+    def assert_numpy_array_equivalent(self, np_array, assert_equal, strict_nan=False):
         """Checks that 'np_array' is equivalent to 'assert_equal'
 
         Two numpy arrays are equivalent if the arrays have equal non-NaN elements, and
@@ -115,7 +115,7 @@ def assert_numpy_array_equivalent(self, np_array, assert_equal):
         similar to `assert_numpy_array_equal()`. If the expected array includes `np.nan` use this function.
        """
-        if array_equivalent(np_array, assert_equal):
+        if array_equivalent(np_array, assert_equal, strict_nan=strict_nan):
             return
         raise AssertionError('{0} is not equivalent to {1}.'.format(np_array, assert_equal))
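
As a quick, informal check of the new ``strict_nan`` flag (not part of the patch itself), the sketch below exercises ``array_equivalent`` the way the new tests do. It assumes the internal import path shown in the patch, ``pandas.core.common.array_equivalent``; this is not public API and may move in later versions.

    # Minimal sketch of the strict_nan semantics added above; assumes the
    # internal import path pandas.core.common.array_equivalent (not public API).
    import numpy as np
    from pandas.core.common import array_equivalent

    left = np.array([None, 'bar', 'baz'], dtype=object)
    right = np.array([np.nan, 'bar', 'baz'], dtype=object)

    # Default behaviour: pd.isnull treats None and NaN as interchangeable
    # missing values, so the two object arrays compare as equivalent.
    print(array_equivalent(left, right))                   # True

    # strict_nan=True distinguishes None from NaN (and NaT), so they differ.
    print(array_equivalent(left, right, strict_nan=True))  # False

    # Matching placements of the same missing value still compare equal.
    print(array_equivalent(left, left.copy(), strict_nan=True))  # True

This is the comparison the new tests rely on via ``assert_numpy_array_equivalent(..., strict_nan=True)``: object columns that preserve ``None`` no longer compare equal to columns holding ``NaN``.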