From 3c345a11417cf9542460f027fa53dab616b1afbb Mon Sep 17 00:00:00 2001 From: jreback Date: Sun, 10 Feb 2013 21:01:47 -0500 Subject: [PATCH 1/9] BUG: fixup GH #2751; make sure that we cast to platform numeric when a list is specified; use the Series codepath for initial list conversion (change from using DataFrame) TST: added test for overflow in df creation --- pandas/core/common.py | 16 ++++++++++++--- pandas/core/frame.py | 20 ++++++++++++++----- pandas/core/series.py | 12 +++++++---- pandas/tests/test_frame.py | 41 +++++++++++++++++++++++++++++++++----- 4 files changed, 72 insertions(+), 17 deletions(-) diff --git a/pandas/core/common.py b/pandas/core/common.py index 7535ed68722fb..b791fa4f6c5e6 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -805,10 +805,11 @@ def _consensus_name_attr(objs): # Lots of little utilities -def _possibly_convert_objects(values, convert_dates=True, convert_numeric=True): +def _possibly_convert_objects(values, convert_dates=True, convert_numeric=True, convert_platform=False): """ if we have an object dtype, try to coerce dates and/or numers """ - if values.dtype == np.object_ and convert_dates: + # convert dates + if convert_dates and getattr(values,'dtype',None) == np.object_: # we take an aggressive stance and convert to datetime64[ns] if convert_dates == 'coerce': @@ -821,7 +822,8 @@ def _possibly_convert_objects(values, convert_dates=True, convert_numeric=True): else: values = lib.maybe_convert_objects(values, convert_datetime=convert_dates) - if values.dtype == np.object_ and convert_numeric: + # convert to numeric + if convert_numeric and getattr(values,'dtype',None) == np.object_: try: new_values = lib.maybe_convert_numeric(values,set(),coerce_numeric=True) @@ -832,6 +834,14 @@ def _possibly_convert_objects(values, convert_dates=True, convert_numeric=True): except: pass + # platform conversion + # allow ndarray or list here + if convert_platform: + if isinstance(values, (list,tuple)): + values = 
lib.list_to_object_array(values) + if values.dtype == np.object_: + values = lib.maybe_convert_objects(values) + return values diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ecf2f8ba482f6..ebf4fe39bec9f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5460,11 +5460,21 @@ def _prep_ndarray(values, copy=True): if len(values) == 0: return np.empty((0, 0), dtype=object) - arr = np.asarray(values) - # NumPy strings are a pain, convert to object - if issubclass(arr.dtype.type, basestring): - arr = np.array(values, dtype=object, copy=True) - values = arr + def convert(v): + return com._possibly_convert_objects(v, + convert_dates=False, + convert_numeric=False, + convert_platform=True) + + + # we could have a 1-dim or 2-dim list here + # this is equiv of np.asarray, but does object conversion + # and platform dtype preservation + if com.is_list_like(values[0]) or hasattr(values[0],'len'): + values = np.array([ convert(v) for v in values]) + else: + values = convert(values) + else: # drop subclass info, do not copy data values = np.asarray(values) diff --git a/pandas/core/series.py b/pandas/core/series.py index bb154896651cd..14e3dacb54b25 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3111,11 +3111,15 @@ def _try_cast(arr): raise subarr = pa.array(data, dtype=object, copy=copy) subarr = lib.maybe_convert_objects(subarr) - subarr = com._possibly_cast_to_datetime(subarr, dtype) + else: - subarr = lib.list_to_object_array(data) - subarr = lib.maybe_convert_objects(subarr) - subarr = com._possibly_cast_to_datetime(subarr, dtype) + subarr = com._possibly_convert_objects(data, + convert_dates=False, + convert_numeric=False, + convert_platform=True) + + subarr = com._possibly_cast_to_datetime(subarr, dtype) + else: subarr = _try_cast(data) diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index c628bf3f0df97..9b46c6eac42bf 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -8133,12 
+8133,43 @@ def test_constructor_with_datetimes(self): expected.sort() assert_series_equal(result, expected) - # GH #2751 (construction with no index specified) - df = DataFrame({'a':[1,2,4,7], 'b':[1.2, 2.3, 5.1, 6.3], 'c':list('abcd'), 'd':[datetime(2000,1,1) for i in range(4)] }) + def test_constructor_for_list_with_dtypes(self): + intname = np.dtype(np.int_).name + floatname = np.dtype(np.float_).name + datetime64name = np.dtype('M8[ns]').name + objectname = np.dtype(np.object_).name + + # test list of lists/ndarrays + df = DataFrame([np.arange(5) for x in range(5)]) + result = df.get_dtype_counts() + expected = Series({'int64' : 5}) + + df = DataFrame([np.array(np.arange(5),dtype='int32') for x in range(5)]) + result = df.get_dtype_counts() + expected = Series({'int32' : 5}) + + # overflow issue? (we always expecte int64 upcasting here) + df = DataFrame({'a' : [2**31,2**31+1]}) + result = df.get_dtype_counts() + expected = Series({'int64' : 1 }) + assert_series_equal(result, expected) + + # GH #2751 (construction with no index specified), make sure we cast to platform values + df = DataFrame([1, 2]) + result = df.get_dtype_counts() + expected = Series({'int64': 1 }) + assert_series_equal(result, expected) + + df = DataFrame({'a' : [1, 2]}) + result = df.get_dtype_counts() + expected = Series({'int64': 1 }) + assert_series_equal(result, expected) + + df = DataFrame({'a':[1,2,4,7], 'b':[1.2, 2.3, 5.1, 6.3], + 'c':list('abcd'), 'd':[datetime(2000,1,1) for i in range(4)], + 'e' : [1.,2,4.,7]}) result = df.get_dtype_counts() - # TODO: fix this on 32-bit (or decide it's ok behavior?) 
- # expected = Series({intname: 1, floatname : 1, datetime64name: 1, objectname : 1}) - expected = Series({'int64': 1, floatname : 1, datetime64name: 1, objectname : 1}) + expected = Series({'int64': 1, 'float64' : 2, datetime64name: 1, objectname : 1}) result.sort() expected.sort() assert_series_equal(result, expected) From 37bb22a657be03e98032999eb29ea8774bc4fb30 Mon Sep 17 00:00:00 2001 From: jreback Date: Tue, 12 Feb 2013 22:48:15 -0500 Subject: [PATCH 2/9] DOC: RELEASE and whatsnew updated for DataFrame from lists change --- RELEASE.rst | 3 +++ doc/source/v0.11.0.txt | 50 ++++++++++++++++++++++++++++-------------- 2 files changed, 36 insertions(+), 17 deletions(-) diff --git a/RELEASE.rst b/RELEASE.rst index e98849123c46c..11b047e4fbb88 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -53,6 +53,9 @@ pandas 0.11.0 - Do not automatically upcast numeric specified dtypes to ``int64`` or ``float64`` (GH622_ and GH797_) + - DataFrame construction of lists will no longer be platform dependent when + dtype is NOT specified, e.g. DataFrame([1,2]) will be ``int64`` + like DataFrame({'a' : [1,2]}) - Guarantee that ``convert_objects()`` for Series/DataFrame always returns a copy - groupby operations will respect dtypes for numeric float operations diff --git a/doc/source/v0.11.0.txt b/doc/source/v0.11.0.txt index d2648cbdb5a44..a03cfcbad11bb 100644 --- a/doc/source/v0.11.0.txt +++ b/doc/source/v0.11.0.txt @@ -3,7 +3,7 @@ v0.11.0 (March ??, 2013) ------------------------ -This is a minor release from 0.10.1 and includes many new features and +This is a major release from 0.10.1 and includes many new features and enhancements along with a large number of bug fixes. There are also a number of important API changes that long-time pandas users should pay close attention to. @@ -54,6 +54,18 @@ Numeric dtypes will propagate and can coexist in DataFrames. 
If a dtype is passe Timestamp('20010104'), '20010105'],dtype='O') s.convert_objects(convert_dates='coerce') + +**Platform Gotchas** + +In versions prior to 0.11.0, DataFrame construction with lists was platform dependent (meaning 32-bit vs 64-bit). +``DataFrame([1,2],columns=['a'])`` would have a dtype of ``int32``, +while ``DataFrame({'a' : [1,2] })`` would be ``int64``. +Now construction dtype defaults will be handled in a platform independent manor, +resulting in defaults for integers of ``int64`` and floats of ``float64`` dtypes. + +Keep in mind that ``DataFrame(np.array([1,2]))`` **WILL** result in ``int32`` on 32-bit platforms! + + **Upcasting Gotchas** Performing indexing operations on integer type data can easily upcast the data. @@ -82,21 +94,11 @@ While float dtypes are unchanged. casted casted.dtypes -New features -~~~~~~~~~~~~ - -**Enhancements** - - - In ``HDFStore``, provide dotted attribute access to ``get`` from stores (e.g. store.df == store['df']) - -**Bug Fixes** - -See the `full release notes -`__ or issue tracker -on GitHub for a complete list. - +**Datetimes conversion** -Datetime64[ns] columns in a DataFrame (or a Series) allow the use of ``np.nan`` to indicate a nan value, in addition to the traditional ``NaT``, or not-a-time. This allows convenient nan setting in a generic way. Furthermore datetime64 columns are created by default, when passed datetimelike objects (*this change was introduced in 0.10.1*) +Datetime64[ns] columns in a DataFrame (or a Series) allow the use of ``np.nan`` to indicate a nan value, +in addition to the traditional ``NaT``, or not-a-time. This allows convenient nan setting in a generic way. +Furthermore ``datetime64[ns]`` columns are created by default, when passed datetimelike objects (*this change was introduced in 0.10.1*) .. 
ipython:: python @@ -111,8 +113,7 @@ Datetime64[ns] columns in a DataFrame (or a Series) allow the use of ``np.nan`` df.ix[2:4,['A','timestamp']] = np.nan df -Astype conversion on datetime64[ns] to object, implicity converts ``NaT`` to ``np.nan`` - +Astype conversion on ``datetime64[ns]`` to ``object``, implicity converts ``NaT`` to ``np.nan`` .. ipython:: python @@ -127,6 +128,13 @@ Astype conversion on datetime64[ns] to object, implicity converts ``NaT`` to ``n s.dtype +New features +~~~~~~~~~~~~ + +**Enhancements** + + - In ``HDFStore``, provide dotted attribute access to ``get`` from stores (e.g. store.df == store['df']) + ``Squeeze`` to possibly remove length 1 dimensions from an object. .. ipython:: python @@ -137,3 +145,11 @@ Astype conversion on datetime64[ns] to object, implicity converts ``NaT`` to ``n p p.reindex(items=['ItemA']).squeeze() p.reindex(items=['ItemA'],minor=['B']).squeeze() + +**Bug Fixes** + +See the `full release notes +`__ or issue tracker +on GitHub for a complete list. + + From 6cdea33d0b384ecb8bdc694fd8388618d32bca63 Mon Sep 17 00:00:00 2001 From: jreback Date: Wed, 13 Feb 2013 08:32:02 -0500 Subject: [PATCH 3/9] CLN: cleaned up _possibly_convert_platform --- RELEASE.rst | 6 +- doc/source/v0.11.0.txt | 16 +- pandas/core/common.py | 46 ++++-- pandas/core/frame.py | 24 +-- pandas/core/series.py | 15 +- pandas/tests/test_frame.py | 275 ++++++++++++++++++------------- pandas/tests/test_panel.py | 2 +- pandas/tests/test_panel4d.py | 2 +- pandas/tools/tests/test_merge.py | 2 +- 9 files changed, 228 insertions(+), 160 deletions(-) diff --git a/RELEASE.rst b/RELEASE.rst index 11b047e4fbb88..25350555317bd 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -53,9 +53,9 @@ pandas 0.11.0 - Do not automatically upcast numeric specified dtypes to ``int64`` or ``float64`` (GH622_ and GH797_) - - DataFrame construction of lists will no longer be platform dependent when - dtype is NOT specified, e.g. 
DataFrame([1,2]) will be ``int64`` + like DataFrame({'a' : [1,2]}) + - DataFrame construction of lists and scalars, with no dtype present, will + result in casting to ``int64`` or ``float64``, regardless of platform. + This is not an apparent change in the API, but noting it. - Guarantee that ``convert_objects()`` for Series/DataFrame always returns a copy - groupby operations will respect dtypes for numeric float operations diff --git a/doc/source/v0.11.0.txt b/doc/source/v0.11.0.txt index a03cfcbad11bb..fd78ba7cfb283 100644 --- a/doc/source/v0.11.0.txt +++ b/doc/source/v0.11.0.txt @@ -57,11 +57,17 @@ Numeric dtypes will propagate and can coexist in DataFrames. If a dtype is passe **Platform Gotchas** -In versions prior to 0.11.0, DataFrame construction with lists was platform dependent (meaning 32-bit vs 64-bit). -``DataFrame([1,2],columns=['a'])`` would have a dtype of ``int32``, -while ``DataFrame({'a' : [1,2] })`` would be ``int64``. -Now construction dtype defaults will be handled in a platform independent manor, -resulting in defaults for integers of ``int64`` and floats of ``float64`` dtypes. +Starting in 0.11.0, construction of DataFrame/Series will use default dtypes of ``int64`` and ``float64``, +*regardless of platform*. This is not an apparent change from earlier versions of pandas. If you specify +dtypes, they *WILL* be respected, however. + +The following will all result in ``int64`` dtypes + +.. ipython:: python + + DataFrame([1,2],columns=['a']).dtypes + DataFrame({'a' : [1,2] }).dtypes + DataFrame({'a' : 1 }, index=range(2)).dtypes Keep in mind that ``DataFrame(np.array([1,2]))`` **WILL** result in ``int32`` on 32-bit platforms! 
diff --git a/pandas/core/common.py b/pandas/core/common.py index b791fa4f6c5e6..51e14b05495db 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -643,6 +643,21 @@ def take_fast(arr, indexer, mask, needs_masking, axis=0, out=None, take_f(arr, indexer, out=out, fill_value=fill_value) return out +def _dtype_from_scalar(val): + """ interpret the dtype from a scalar, upcast floats and ints """ + if isinstance(val, np.datetime64): + # ugly hacklet + val = lib.Timestamp(val).value + return val, np.dtype('M8[ns]') + + # provide implicity upcast on scalars + elif is_integer(val): + if not is_bool(val): + return val, np.int64 + elif is_float(val): + return val, np.float64 + + return val, type(val) def _maybe_promote(dtype, fill_value=np.nan): if issubclass(dtype.type, np.datetime64): @@ -654,7 +669,7 @@ def _maybe_promote(dtype, fill_value=np.nan): if issubclass(dtype.type, np.bool_): return np.object_ elif issubclass(dtype.type, np.integer): - return np.float_ + return np.float64 return dtype elif is_bool(fill_value): if issubclass(dtype.type, np.bool_): @@ -682,7 +697,7 @@ def _maybe_promote(dtype, fill_value=np.nan): def _maybe_upcast(values): # TODO: convert remaining usage of _maybe_upcast to _maybe_promote if issubclass(values.dtype.type, np.integer): - values = values.astype(np.float_) + values = values.astype(np.float64) elif issubclass(values.dtype.type, np.bool_): values = values.astype(np.object_) return values @@ -805,11 +820,11 @@ def _consensus_name_attr(objs): # Lots of little utilities -def _possibly_convert_objects(values, convert_dates=True, convert_numeric=True, convert_platform=False): +def _possibly_convert_objects(values, convert_dates=True, convert_numeric=True): """ if we have an object dtype, try to coerce dates and/or numers """ # convert dates - if convert_dates and getattr(values,'dtype',None) == np.object_: + if convert_dates and values.dtype == np.object_: # we take an aggressive stance and convert to datetime64[ns] if 
convert_dates == 'coerce': @@ -823,7 +838,7 @@ def _possibly_convert_objects(values, convert_dates=True, convert_numeric=True, values = lib.maybe_convert_objects(values, convert_datetime=convert_dates) # convert to numeric - if convert_numeric and getattr(values,'dtype',None) == np.object_: + if convert_numeric and values.dtype == np.object_: try: new_values = lib.maybe_convert_numeric(values,set(),coerce_numeric=True) @@ -834,13 +849,15 @@ def _possibly_convert_objects(values, convert_dates=True, convert_numeric=True, except: pass - # platform conversion - # allow ndarray or list here - if convert_platform: - if isinstance(values, (list,tuple)): - values = lib.list_to_object_array(values) - if values.dtype == np.object_: - values = lib.maybe_convert_objects(values) + return values + +def _possibly_convert_platform(values): + """ try to do platform conversion, allow ndarray or list here """ + + if isinstance(values, (list,tuple)): + values = lib.list_to_object_array(values) + if values.dtype == np.object_: + values = lib.maybe_convert_objects(values) return values @@ -887,12 +904,13 @@ def _possibly_cast_to_datetime(value, dtype, coerce = False): def _infer_dtype(value): + # provide upcasting here for floats/ints if isinstance(value, (float, np.floating)): - return np.float_ + return np.float64 elif isinstance(value, (bool, np.bool_)): return np.bool_ elif isinstance(value, (int, long, np.integer)): - return np.int_ + return np.int64 elif isinstance(value, (complex, np.complexfloating)): return np.complex_ else: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ebf4fe39bec9f..d60ae4477af86 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -23,13 +23,13 @@ import numpy.ma as ma from pandas.core.common import (isnull, notnull, PandasError, _try_sort, - _default_index, _is_sequence) + _default_index, _is_sequence, _dtype_from_scalar) from pandas.core.generic import NDFrame from pandas.core.index import Index, MultiIndex, _ensure_index from 
pandas.core.indexing import (_NDFrameIndexer, _maybe_droplevels, _is_index_slice, _check_bool_indexer) from pandas.core.internals import BlockManager, make_block, form_blocks -from pandas.core.series import Series, _radd_compat, _dtype_from_scalar +from pandas.core.series import Series, _radd_compat from pandas.compat.scipy import scoreatpercentile as _quantile from pandas.util.compat import OrderedDict from pandas.util import py3compat @@ -2207,6 +2207,9 @@ def _sanitize_column(self, key, value): if key in self.columns: existing_piece = self[key] + # upcast the scalar + value, dtype = _dtype_from_scalar(value) + # transpose hack if isinstance(existing_piece, DataFrame): shape = (len(existing_piece.columns), len(self.index)) @@ -2214,14 +2217,19 @@ def _sanitize_column(self, key, value): else: value = np.repeat(value, len(self.index)) - # special case for now + # special case for now (promotion) if (com.is_float_dtype(existing_piece) and com.is_integer_dtype(value)): - value = value.astype(np.float64) + dtype = np.float64 + + value = value.astype(dtype) else: - value = np.repeat(value, len(self.index)) + # upcast the scalar + value, dtype = _dtype_from_scalar(value) + value = np.array(np.repeat(value, len(self.index)), dtype=dtype) + value = com._possibly_cast_to_datetime(value, dtype) return np.atleast_2d(np.asarray(value)) def pop(self, item): @@ -5461,11 +5469,7 @@ def _prep_ndarray(values, copy=True): return np.empty((0, 0), dtype=object) def convert(v): - return com._possibly_convert_objects(v, - convert_dates=False, - convert_numeric=False, - convert_platform=True) - + return com._possibly_convert_platform(v) # we could have a 1-dim or 2-dim list here # this is equiv of np.asarray, but does object conversion diff --git a/pandas/core/series.py b/pandas/core/series.py index 14e3dacb54b25..64b90fddfe832 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3113,10 +3113,7 @@ def _try_cast(arr): subarr = lib.maybe_convert_objects(subarr) else: - 
subarr = com._possibly_convert_objects(data, - convert_dates=False, - convert_numeric=False, - convert_platform=True) + subarr = com._possibly_convert_platform(data) subarr = com._possibly_cast_to_datetime(subarr, dtype) @@ -3145,7 +3142,7 @@ def _try_cast(arr): dtype = value.dtype value = value.item() else: - value, dtype = _dtype_from_scalar(value) + value, dtype = com._dtype_from_scalar(value) subarr = pa.empty(len(index), dtype=dtype) else: @@ -3180,14 +3177,6 @@ def _try_cast(arr): return subarr -def _dtype_from_scalar(val): - if isinstance(val, np.datetime64): - # ugly hacklet - val = lib.Timestamp(val).value - return val, np.dtype('M8[ns]') - return val, type(val) - - def _get_rename_function(mapper): if isinstance(mapper, (dict, Series)): def f(x): diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 9b46c6eac42bf..424d6e2e6e5ba 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -372,6 +372,11 @@ def test_setitem(self): self.assertEqual(smaller['col10'].dtype, np.object_) self.assert_((smaller['col10'] == ['1', '2']).all()) + # with a dtype + for dtype in ['int32','int64','float32','float64']: + self.frame[dtype] = np.array(arr,dtype=dtype) + self.assert_(self.frame[dtype].dtype.name == dtype) + def test_setitem_tuple(self): self.frame['A', 'B'] = self.frame['A'] assert_series_equal(self.frame['A', 'B'], self.frame['A']) @@ -437,7 +442,7 @@ def test_setitem_cast(self): # #669, should not cast? 
self.frame['B'] = 0 - self.assert_(self.frame['B'].dtype == np.float_) + self.assert_(self.frame['B'].dtype == np.float64) # cast if pass array of course self.frame['B'] = np.arange(len(self.frame)) @@ -445,18 +450,18 @@ def test_setitem_cast(self): self.frame['foo'] = 'bar' self.frame['foo'] = 0 - self.assert_(self.frame['foo'].dtype == np.int_) + self.assert_(self.frame['foo'].dtype == np.int64) self.frame['foo'] = 'bar' self.frame['foo'] = 2.5 - self.assert_(self.frame['foo'].dtype == np.float_) + self.assert_(self.frame['foo'].dtype == np.float64) self.frame['something'] = 0 - self.assert_(self.frame['something'].dtype == np.int_) + self.assert_(self.frame['something'].dtype == np.int64) self.frame['something'] = 2 - self.assert_(self.frame['something'].dtype == np.int_) + self.assert_(self.frame['something'].dtype == np.int64) self.frame['something'] = 2.5 - self.assert_(self.frame['something'].dtype == np.float_) + self.assert_(self.frame['something'].dtype == np.float64) def test_setitem_boolean_column(self): expected = self.frame.copy() @@ -490,8 +495,12 @@ def test_setitem_corner(self): self.assertEqual(len(dm.columns), 2) self.assertEqual(dm.values.dtype, np.object_) + # upcast dm['C'] = 1 - self.assertEqual(dm['C'].dtype, np.int_) + self.assertEqual(dm['C'].dtype, np.int64) + + dm['E'] = 1. 
+ self.assertEqual(dm['E'].dtype, np.float64) # set existing column dm['A'] = 'bar' @@ -2369,9 +2378,9 @@ def test_constructor_scalar_inference(self): 'float': 3., 'complex': 4j, 'object': 'foo'} df = DataFrame(data, index=np.arange(10)) - self.assert_(df['int'].dtype == np.int_) + self.assert_(df['int'].dtype == np.int64) self.assert_(df['bool'].dtype == np.bool_) - self.assert_(df['float'].dtype == np.float_) + self.assert_(df['float'].dtype == np.float64) self.assert_(df['complex'].dtype == np.complex128) self.assert_(df['object'].dtype == np.object_) @@ -2689,9 +2698,16 @@ def test_constructor_column_duplicates(self): columns=['b', 'a', 'a']) def test_constructor_single_value(self): + + # expecting single value upcasting here df = DataFrame(0., index=[1, 2, 3], columns=['a', 'b', 'c']) - assert_frame_equal(df, DataFrame(np.zeros(df.shape), df.index, + assert_frame_equal(df, DataFrame(np.zeros(df.shape).astype('float64'), df.index, + df.columns)) + + df = DataFrame(0, index=[1, 2, 3], columns=['a', 'b', 'c']) + assert_frame_equal(df, DataFrame(np.zeros(df.shape).astype('int64'), df.index, df.columns)) + df = DataFrame('a', index=[1, 2], columns=['a', 'c']) assert_frame_equal(df, DataFrame(np.array([['a', 'a'], @@ -2705,6 +2721,136 @@ def test_constructor_single_value(self): self.assertRaises( com.PandasError, DataFrame, 'a', [1, 2], ['a', 'c'], float) + + def test_constructor_with_datetimes(self): + intname = np.dtype(np.int_).name + floatname = np.dtype(np.float_).name + datetime64name = np.dtype('M8[ns]').name + objectname = np.dtype(np.object_).name + + # single item + df = DataFrame({'A' : 1, 'B' : 'foo', 'C' : 'bar', 'D' : Timestamp("20010101"), 'E' : datetime(2001,1,2,0,0) }, + index=np.arange(10)) + result = df.get_dtype_counts() + expected = Series({'int64': 1, datetime64name: 2, objectname : 2}) + result.sort() + expected.sort() + assert_series_equal(result, expected) + + # check with ndarray construction ndim==0 (e.g. 
we are passing a ndim 0 ndarray with a dtype specified) + df = DataFrame({'a': 1., 'b': 2, 'c': 'foo', floatname : np.array(1.,dtype=floatname), + intname : np.array(1,dtype=intname)}, index=np.arange(10)) + result = df.get_dtype_counts() + expected = { objectname : 1 } + if intname == 'int64': + expected['int64'] = 2 + else: + expected['int64'] = 1 + expected[intname] = 1 + if floatname == 'float64': + expected['float64'] = 2 + else: + expected['float64'] = 1 + expected[floatname] = 1 + + result.sort() + expected = Series(expected) + expected.sort() + assert_series_equal(result, expected) + + # check with ndarray construction ndim>0 + df = DataFrame({'a': 1., 'b': 2, 'c': 'foo', floatname : np.array([1.]*10,dtype=floatname), + intname : np.array([1]*10,dtype=intname)}, index=np.arange(10)) + result = df.get_dtype_counts() + result.sort() + assert_series_equal(result, expected) + + # GH 2809 + from pandas import date_range + ind = date_range(start="2000-01-01", freq="D", periods=10) + datetimes = [ts.to_pydatetime() for ts in ind] + datetime_s = Series(datetimes) + self.assert_(datetime_s.dtype == 'M8[ns]') + df = DataFrame({'datetime_s':datetime_s}) + result = df.get_dtype_counts() + expected = Series({ datetime64name : 1 }) + result.sort() + expected.sort() + assert_series_equal(result, expected) + + # GH 2810 + ind = date_range(start="2000-01-01", freq="D", periods=10) + datetimes = [ts.to_pydatetime() for ts in ind] + dates = [ts.date() for ts in ind] + df = DataFrame({'datetimes': datetimes, 'dates':dates}) + result = df.get_dtype_counts() + expected = Series({ datetime64name : 1, objectname : 1 }) + result.sort() + expected.sort() + assert_series_equal(result, expected) + + def test_constructor_for_list_with_dtypes(self): + intname = np.dtype(np.int_).name + floatname = np.dtype(np.float_).name + datetime64name = np.dtype('M8[ns]').name + objectname = np.dtype(np.object_).name + + # test list of lists/ndarrays + df = DataFrame([np.arange(5) for x in 
range(5)]) + result = df.get_dtype_counts() + expected = Series({'int64' : 5}) + + df = DataFrame([np.array(np.arange(5),dtype='int32') for x in range(5)]) + result = df.get_dtype_counts() + expected = Series({'int32' : 5}) + + # overflow issue? (we always expecte int64 upcasting here) + df = DataFrame({'a' : [2**31,2**31+1]}) + result = df.get_dtype_counts() + expected = Series({'int64' : 1 }) + assert_series_equal(result, expected) + + # GH #2751 (construction with no index specified), make sure we cast to platform values + df = DataFrame([1, 2]) + result = df.get_dtype_counts() + expected = Series({'int64': 1 }) + assert_series_equal(result, expected) + + df = DataFrame([1.,2.]) + result = df.get_dtype_counts() + expected = Series({'float64' : 1 }) + assert_series_equal(result, expected) + + df = DataFrame({'a' : [1, 2]}) + result = df.get_dtype_counts() + expected = Series({'int64' : 1}) + assert_series_equal(result, expected) + + df = DataFrame({'a' : [1., 2.]}) + result = df.get_dtype_counts() + expected = Series({'float64' : 1}) + assert_series_equal(result, expected) + + df = DataFrame({'a' : 1 }, index=range(3)) + result = df.get_dtype_counts() + expected = Series({'int64': 1}) + assert_series_equal(result, expected) + + df = DataFrame({'a' : 1. 
}, index=range(3)) + result = df.get_dtype_counts() + expected = Series({'float64': 1 }) + assert_series_equal(result, expected) + + # with object list + df = DataFrame({'a':[1,2,4,7], 'b':[1.2, 2.3, 5.1, 6.3], + 'c':list('abcd'), 'd':[datetime(2000,1,1) for i in range(4)], + 'e' : [1.,2,4.,7]}) + result = df.get_dtype_counts() + expected = Series({'int64': 1, 'float64' : 2, datetime64name: 1, objectname : 1}) + result.sort() + expected.sort() + assert_series_equal(result, expected) + def test_new_empty_index(self): df1 = DataFrame(randn(0, 3)) df2 = DataFrame(randn(0, 3)) @@ -2757,8 +2903,12 @@ def _check_cast(df, v): casted = mn.astype('float32') _check_cast(casted, 'float32') - casted = mn.astype('int32') - _check_cast(casted, 'int32') + # this is platform dependent overflow + if np.int_ == np.int32: + self.assertRaises(OverflowError, mn.astype, 'int32') + else: + casted = mn.astype('int32') + _check_cast(casted, 'int32') # to object casted = mn.astype('O') @@ -7156,7 +7306,7 @@ def test_get_numeric_data(self): df = DataFrame({'a': 1., 'b': 2, 'c': 'foo', 'f' : Timestamp('20010102')}, index=np.arange(10)) result = df.get_dtype_counts() - expected = Series({intname: 1, floatname : 1, datetime64name: 1, objectname : 1}) + expected = Series({'int64': 1, 'float64' : 1, datetime64name: 1, objectname : 1}) result.sort() expected.sort() assert_series_equal(result, expected) @@ -8099,105 +8249,6 @@ def test_as_matrix_lcd(self): values = self.mixed_int.as_matrix(['C']) self.assert_(values.dtype == np.uint8) - - def test_constructor_with_datetimes(self): - intname = np.dtype(np.int_).name - floatname = np.dtype(np.float_).name - datetime64name = np.dtype('M8[ns]').name - objectname = np.dtype(np.object_).name - - # single item - df = DataFrame({'A' : 1, 'B' : 'foo', 'C' : 'bar', 'D' : Timestamp("20010101"), 'E' : datetime(2001,1,2,0,0) }, - index=np.arange(10)) - result = df.get_dtype_counts() - expected = Series({intname: 1, datetime64name: 2, objectname : 2}) - 
result.sort() - expected.sort() - assert_series_equal(result, expected) - - # check with ndarray construction ndim==0 (e.g. we are passing a ndim 0 ndarray with a dtype specified) - df = DataFrame({'a': 1., 'b': 2, 'c': 'foo', floatname : np.array(1.,dtype=floatname), - intname : np.array(1,dtype=intname)}, index=np.arange(10)) - result = df.get_dtype_counts() - expected = Series({intname: 2, floatname : 2, objectname : 1}) - result.sort() - expected.sort() - assert_series_equal(result, expected) - - # check with ndarray construction ndim>0 - df = DataFrame({'a': 1., 'b': 2, 'c': 'foo', floatname : np.array([1.]*10,dtype=floatname), - intname : np.array([1]*10,dtype=intname)}, index=np.arange(10)) - result = df.get_dtype_counts() - expected = Series({intname: 2, floatname : 2, objectname : 1}) - result.sort() - expected.sort() - assert_series_equal(result, expected) - - def test_constructor_for_list_with_dtypes(self): - intname = np.dtype(np.int_).name - floatname = np.dtype(np.float_).name - datetime64name = np.dtype('M8[ns]').name - objectname = np.dtype(np.object_).name - - # test list of lists/ndarrays - df = DataFrame([np.arange(5) for x in range(5)]) - result = df.get_dtype_counts() - expected = Series({'int64' : 5}) - - df = DataFrame([np.array(np.arange(5),dtype='int32') for x in range(5)]) - result = df.get_dtype_counts() - expected = Series({'int32' : 5}) - - # overflow issue? 
(we always expecte int64 upcasting here) - df = DataFrame({'a' : [2**31,2**31+1]}) - result = df.get_dtype_counts() - expected = Series({'int64' : 1 }) - assert_series_equal(result, expected) - - # GH #2751 (construction with no index specified), make sure we cast to platform values - df = DataFrame([1, 2]) - result = df.get_dtype_counts() - expected = Series({'int64': 1 }) - assert_series_equal(result, expected) - - df = DataFrame({'a' : [1, 2]}) - result = df.get_dtype_counts() - expected = Series({'int64': 1 }) - assert_series_equal(result, expected) - - df = DataFrame({'a':[1,2,4,7], 'b':[1.2, 2.3, 5.1, 6.3], - 'c':list('abcd'), 'd':[datetime(2000,1,1) for i in range(4)], - 'e' : [1.,2,4.,7]}) - result = df.get_dtype_counts() - expected = Series({'int64': 1, 'float64' : 2, datetime64name: 1, objectname : 1}) - result.sort() - expected.sort() - assert_series_equal(result, expected) - - # GH 2809 - from pandas import date_range - ind = date_range(start="2000-01-01", freq="D", periods=10) - datetimes = [ts.to_pydatetime() for ts in ind] - datetime_s = Series(datetimes) - self.assert_(datetime_s.dtype == 'M8[ns]') - df = DataFrame({'datetime_s':datetime_s}) - result = df.get_dtype_counts() - expected = Series({ datetime64name : 1 }) - result.sort() - expected.sort() - assert_series_equal(result, expected) - - # GH 2810 - ind = date_range(start="2000-01-01", freq="D", periods=10) - datetimes = [ts.to_pydatetime() for ts in ind] - dates = [ts.date() for ts in ind] - df = DataFrame({'datetimes': datetimes, 'dates':dates}) - result = df.get_dtype_counts() - expected = Series({ datetime64name : 1, objectname : 1 }) - result.sort() - expected.sort() - assert_series_equal(result, expected) - def test_constructor_frame_copy(self): cop = DataFrame(self.frame, copy=True) cop['A'] = 5 diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index 07a02f18d8337..da7a0f68b3eb4 100644 --- a/pandas/tests/test_panel.py +++ b/pandas/tests/test_panel.py @@ -418,7 +418,7 
@@ def test_setitem(self): # scalar self.panel['ItemG'] = 1 self.panel['ItemE'] = True - self.assert_(self.panel['ItemG'].values.dtype == np.int_) + self.assert_(self.panel['ItemG'].values.dtype == np.int64) self.assert_(self.panel['ItemE'].values.dtype == np.bool_) # object dtype diff --git a/pandas/tests/test_panel4d.py b/pandas/tests/test_panel4d.py index 87bfba7c55cce..5bb452deb1d4d 100644 --- a/pandas/tests/test_panel4d.py +++ b/pandas/tests/test_panel4d.py @@ -358,7 +358,7 @@ def test_setitem(self): # scalar self.panel4d['lG'] = 1 self.panel4d['lE'] = True - self.assert_(self.panel4d['lG'].values.dtype == np.int_) + self.assert_(self.panel4d['lG'].values.dtype == np.int64) self.assert_(self.panel4d['lE'].values.dtype == np.bool_) # object dtype diff --git a/pandas/tools/tests/test_merge.py b/pandas/tools/tests/test_merge.py index 8820d43975885..d1c4710c16aad 100644 --- a/pandas/tools/tests/test_merge.py +++ b/pandas/tools/tests/test_merge.py @@ -287,7 +287,7 @@ def test_join_index_mixed(self): df1 = DataFrame({'A': 1., 'B': 2, 'C': 'foo', 'D': True}, index=np.arange(10), columns=['A', 'B', 'C', 'D']) - self.assert_(df1['B'].dtype == np.int) + self.assert_(df1['B'].dtype == np.int64) self.assert_(df1['D'].dtype == np.bool_) df2 = DataFrame({'A': 1., 'B': 2, 'C': 'foo', 'D': True}, From 43a01025ca7d639de99427827f39b1a80949e761 Mon Sep 17 00:00:00 2001 From: jreback Date: Wed, 13 Feb 2013 16:57:20 -0500 Subject: [PATCH 4/9] CLN: moved some functionality from series._sanitize to com._dtype_from_scalar --- pandas/core/common.py | 17 ++++++++++++++++- pandas/core/series.py | 23 +++++------------------ 2 files changed, 21 insertions(+), 19 deletions(-) diff --git a/pandas/core/common.py b/pandas/core/common.py index 51e14b05495db..0f71735af3dbf 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -24,6 +24,7 @@ from pandas.util.py3compat import StringIO, BytesIO from pandas.core.config import get_option +from pandas.core import array as pa # XXX: HACK 
for NumPy 1.5.1 to suppress warnings try: @@ -645,7 +646,21 @@ def take_fast(arr, indexer, mask, needs_masking, axis=0, out=None, def _dtype_from_scalar(val): """ interpret the dtype from a scalar, upcast floats and ints """ - if isinstance(val, np.datetime64): + + # a 1-element ndarray + if isinstance(val, pa.Array): + return val.item(), val.dtype + + elif isinstance(val, basestring): + + # If we create an empty array using a string to infer + # the dtype, NumPy will only allocate one character per entry + # so this is kind of bad. Alternately we could use np.repeat + # instead of np.empty (but then you still don't want things + # coming out as np.str_! + return val, np.object_ + + elif isinstance(val, np.datetime64): # ugly hacklet val = lib.Timestamp(val).value return val, np.dtype('M8[ns]') diff --git a/pandas/core/series.py b/pandas/core/series.py index 64b90fddfe832..5405637ff7382 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3127,29 +3127,16 @@ def _try_cast(arr): elif index is not None: value = data - # If we create an empty array using a string to infer - # the dtype, NumPy will only allocate one character per entry - # so this is kind of bad. Alternately we could use np.repeat - # instead of np.empty (but then you still don't want things - # coming out as np.str_! 
- if isinstance(value, basestring) and dtype is None: - dtype = np.object_ - + # figure out the dtype from the value (upcast if necessary) if dtype is None: - - # a 1-element ndarray - if isinstance(value, pa.Array): - dtype = value.dtype - value = value.item() - else: - value, dtype = com._dtype_from_scalar(value) - - subarr = pa.empty(len(index), dtype=dtype) + value, dtype = com._dtype_from_scalar(value) else: # need to possibly convert the value here value = com._possibly_cast_to_datetime(value, dtype) - subarr = pa.empty(len(index), dtype=dtype) + + subarr = pa.empty(len(index), dtype=dtype) subarr.fill(value) + else: return subarr.item() From ac3cdab926ae7e944720aa8e20622f4fad50a1b8 Mon Sep 17 00:00:00 2001 From: jreback Date: Wed, 13 Feb 2013 17:24:34 -0500 Subject: [PATCH 5/9] DOC: whatsnew updates --- doc/source/v0.11.0.txt | 36 +++++++++++++++++++++++------------- pandas/tests/test_frame.py | 8 ++------ 2 files changed, 25 insertions(+), 19 deletions(-) diff --git a/doc/source/v0.11.0.txt b/doc/source/v0.11.0.txt index fd78ba7cfb283..0162ee85ac518 100644 --- a/doc/source/v0.11.0.txt +++ b/doc/source/v0.11.0.txt @@ -13,7 +13,8 @@ API changes Numeric dtypes will propagate and can coexist in DataFrames. If a dtype is passed (either directly via the ``dtype`` keyword, a passed ``ndarray``, or a passed ``Series``, then it will be preserved in DataFrame operations. Furthermore, different numeric dtypes will **NOT** be combined. The following example will give you a taste. -**Dtype Specification** +Dtype Specification +~~~~~~~~~~~~~~~~~~~ .. ipython:: python @@ -29,7 +30,8 @@ Numeric dtypes will propagate and can coexist in DataFrames. If a dtype is passe df3 df3.dtypes -**Dtype conversion** +Dtype Conversion +~~~~~~~~~~~~~~~~ .. ipython:: python @@ -54,20 +56,22 @@ Numeric dtypes will propagate and can coexist in DataFrames. 
If a dtype is passe Timestamp('20010104'), '20010105'],dtype='O') s.convert_objects(convert_dates='coerce') +Dtype Gotchas +~~~~~~~~~~~~~ **Platform Gotchas** Starting in 0.11.0, construction of DataFrame/Series will use default dtypes of ``int64`` and ``float64``, *regardless of platform*. This is not an apparent change from earlier versions of pandas. If you specify -dtypes, they *WILL* be respected, however. +dtypes, they *WILL* be respected, however (GH2837_) The following will all result in ``int64`` dtypes .. ipython:: python DataFrame([1,2],columns=['a']).dtypes - DataFrame({'a' : [1,2] }.dtypes - DataFrame({'a' : 1).dtypes + DataFrame({'a' : [1,2] }).dtypes + DataFrame({'a' : 1 }, index=range(2)).dtypes Keep in mind that ``DataFrame(np.array([1,2]))`` **WILL** result in ``int32`` on 32-bit platforms! @@ -100,11 +104,13 @@ While float dtypes are unchanged. casted casted.dtypes -**Datetimes conversion** +Datetimes Conversion +~~~~~~~~~~~~~~~~~~~~ Datetime64[ns] columns in a DataFrame (or a Series) allow the use of ``np.nan`` to indicate a nan value, in addition to the traditional ``NaT``, or not-a-time. This allows convenient nan setting in a generic way. Furthermore ``datetime64[ns]`` columns are created by default, when passed datetimelike objects (*this change was introduced in 0.10.1*) +(GH2809_, GH2810_) .. ipython:: python @@ -139,18 +145,19 @@ New features **Enhancements** - - In ``HDFStore``, provide dotted attribute access to ``get`` from stores (e.g. store.df == store['df']) + - In ``HDFStore``, provide dotted attribute access to ``get`` from stores + (e.g. store.df == store['df']) -``Squeeze`` to possibly remove length 1 dimensions from an object. + - ``Squeeze`` to possibly remove length 1 dimensions from an object. -.. ipython:: python + .. 
ipython:: python - p = Panel(randn(3,4,4),items=['ItemA','ItemB','ItemC'], + p = Panel(randn(3,4,4),items=['ItemA','ItemB','ItemC'], major_axis=date_range('20010102',periods=4), minor_axis=['A','B','C','D']) - p - p.reindex(items=['ItemA']).squeeze() - p.reindex(items=['ItemA'],minor=['B']).squeeze() + p + p.reindex(items=['ItemA']).squeeze() + p.reindex(items=['ItemA'],minor=['B']).squeeze() **Bug Fixes** @@ -158,4 +165,7 @@ See the `full release notes `__ or issue tracker on GitHub for a complete list. +.. _GH2809: https://github.com/pydata/pandas/issues/2809 +.. _GH2810: https://github.com/pydata/pandas/issues/2810 +.. _GH2837: https://github.com/pydata/pandas/issues/2837 diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 424d6e2e6e5ba..24883b3359c42 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -2903,12 +2903,8 @@ def _check_cast(df, v): casted = mn.astype('float32') _check_cast(casted, 'float32') - # this is platform dependent overflow - if np.int_ == np.int32: - self.assertRaises(OverflowError, mn.astype, 'int32') - else: - casted = mn.astype('int32') - _check_cast(casted, 'int32') + casted = mn.astype('int32') + _check_cast(casted, 'int32') # to object casted = mn.astype('O') From 0e7c20e23fbe38c162e5d068e1474bca487c3173 Mon Sep 17 00:00:00 2001 From: jreback Date: Wed, 13 Feb 2013 18:36:01 -0500 Subject: [PATCH 6/9] CLN: in common.py merged _dtype_from_scalar and _infer_dtype yield _infer_dtype_from_scalar --- pandas/core/common.py | 31 +++++++++++++------------------ pandas/core/frame.py | 10 +++++----- pandas/core/internals.py | 2 +- pandas/core/panel.py | 8 ++++---- pandas/core/series.py | 5 +++-- 5 files changed, 26 insertions(+), 30 deletions(-) diff --git a/pandas/core/common.py b/pandas/core/common.py index 0f71735af3dbf..fb17eb8d98c83 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -644,11 +644,15 @@ def take_fast(arr, indexer, mask, needs_masking, axis=0, out=None, take_f(arr, 
indexer, out=out, fill_value=fill_value) return out -def _dtype_from_scalar(val): - """ interpret the dtype from a scalar, upcast floats and ints """ +def _infer_dtype_from_scalar(val): + """ interpret the dtype from a scalar, upcast floats and ints + return the new value and the dtype """ # a 1-element ndarray if isinstance(val, pa.Array): + if val.ndim != 0: + raise ValueError("invalid ndarray passed to _dtype_from_scalar") + return val.item(), val.dtype elif isinstance(val, basestring): @@ -665,14 +669,19 @@ def _dtype_from_scalar(val): val = lib.Timestamp(val).value return val, np.dtype('M8[ns]') + elif is_bool(val): + return val, np.bool_ + # provide implicity upcast on scalars elif is_integer(val): - if not is_bool(val): return val, np.int64 elif is_float(val): return val, np.float64 - return val, type(val) + elif is_complex(val): + return val, np.complex_ + + return val, np.object_ def _maybe_promote(dtype, fill_value=np.nan): if issubclass(dtype.type, np.datetime64): @@ -918,20 +927,6 @@ def _possibly_cast_to_datetime(value, dtype, coerce = False): return value -def _infer_dtype(value): - # provide upcasting here for floats/ints - if isinstance(value, (float, np.floating)): - return np.float64 - elif isinstance(value, (bool, np.bool_)): - return np.bool_ - elif isinstance(value, (int, long, np.integer)): - return np.int64 - elif isinstance(value, (complex, np.complexfloating)): - return np.complex_ - else: - return np.object_ - - def _possibly_cast_item(obj, item, dtype): chunk = obj[item] diff --git a/pandas/core/frame.py b/pandas/core/frame.py index d60ae4477af86..efb3520b152c0 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -23,7 +23,7 @@ import numpy.ma as ma from pandas.core.common import (isnull, notnull, PandasError, _try_sort, - _default_index, _is_sequence, _dtype_from_scalar) + _default_index, _is_sequence, _infer_dtype_from_scalar) from pandas.core.generic import NDFrame from pandas.core.index import Index, MultiIndex, 
_ensure_index from pandas.core.indexing import (_NDFrameIndexer, _maybe_droplevels, @@ -437,7 +437,7 @@ def __init__(self, data=None, index=None, columns=None, dtype=None, if isinstance(data, basestring) and dtype is None: dtype = np.object_ if dtype is None: - data, dtype = _dtype_from_scalar(data) + data, dtype = _infer_dtype_from_scalar(data) values = np.empty((len(index), len(columns)), dtype=dtype) values.fill(data) @@ -1878,7 +1878,7 @@ def set_value(self, index, col, value): new_index, new_columns = self._expand_axes((index, col)) result = self.reindex(index=new_index, columns=new_columns, copy=False) - likely_dtype = com._infer_dtype(value) + value, likely_dtype = _infer_dtype_from_scalar(value) made_bigger = not np.array_equal(new_columns, self.columns) @@ -2208,7 +2208,7 @@ def _sanitize_column(self, key, value): existing_piece = self[key] # upcast the scalar - value, dtype = _dtype_from_scalar(value) + value, dtype = _infer_dtype_from_scalar(value) # transpose hack if isinstance(existing_piece, DataFrame): @@ -2226,7 +2226,7 @@ def _sanitize_column(self, key, value): else: # upcast the scalar - value, dtype = _dtype_from_scalar(value) + value, dtype = _infer_dtype_from_scalar(value) value = np.array(np.repeat(value, len(self.index)), dtype=dtype) value = com._possibly_cast_to_datetime(value, dtype) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index ee024ce68b5b4..ddcf271dc0687 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -1412,7 +1412,7 @@ def _make_na_block(self, items, ref_items, fill_value=np.nan): block_shape = list(self.shape) block_shape[0] = len(items) - dtype = com._infer_dtype(fill_value) + fill_value, dtype = com._infer_dtype_from_scalar(fill_value) block_values = np.empty(block_shape, dtype=dtype) block_values.fill(fill_value) na_block = make_block(block_values, items, ref_items) diff --git a/pandas/core/panel.py b/pandas/core/panel.py index 6b867f9a643db..6fea1cc85c728 100644 --- 
a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -7,7 +7,7 @@ import sys import numpy as np from pandas.core.common import (PandasError, _mut_exclusive, - _try_sort, _default_index, _infer_dtype, + _try_sort, _default_index, _infer_dtype_from_scalar, notnull) from pandas.core.categorical import Factor from pandas.core.index import (Index, MultiIndex, _ensure_index, @@ -657,8 +657,8 @@ def set_value(self, *args): axes = self._expand_axes(args) d = self._construct_axes_dict_from(self, axes, copy=False) result = self.reindex(**d) - - likely_dtype = com._infer_dtype(args[-1]) + args = list(args) + args[-1], likely_dtype = _infer_dtype_from_scalar(args[-1]) made_bigger = not np.array_equal( axes[0], getattr(self, self._info_axis)) # how to make this logic simpler? @@ -693,7 +693,7 @@ def __setitem__(self, key, value): assert(value.shape == shape[1:]) mat = np.asarray(value) elif np.isscalar(value): - dtype = _infer_dtype(value) + value, dtype = _infer_dtype_from_scalar(value) mat = np.empty(shape[1:], dtype=dtype) mat.fill(value) else: diff --git a/pandas/core/series.py b/pandas/core/series.py index 5405637ff7382..bb7dd934cb383 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -16,7 +16,8 @@ from pandas.core.common import (isnull, notnull, _is_bool_indexer, _default_index, _maybe_upcast, - _asarray_tuplesafe, is_integer_dtype) + _asarray_tuplesafe, is_integer_dtype, + _infer_dtype_from_scalar) from pandas.core.index import (Index, MultiIndex, InvalidIndexError, _ensure_index, _handle_legacy_indexes) from pandas.core.indexing import _SeriesIndexer, _check_bool_indexer @@ -3129,7 +3130,7 @@ def _try_cast(arr): # figure out the dtype from the value (upcast if necessary) if dtype is None: - value, dtype = com._dtype_from_scalar(value) + value, dtype = _infer_dtype_from_scalar(value) else: # need to possibly convert the value here value = com._possibly_cast_to_datetime(value, dtype) From 3cb91f09c5a710c589d365e3dfea20501b09b978 Mon Sep 17 00:00:00 2001 
From: jreback Date: Wed, 13 Feb 2013 21:18:27 -0500 Subject: [PATCH 7/9] CLN: in common.py - revised _maybe_upcast to use _maybe_promote in rehashpe.py - removed block2d_to_block3d in favor of block2d_to_blocknd --- pandas/core/common.py | 32 ++++++++++++++---------------- pandas/core/frame.py | 6 +++--- pandas/core/reshape.py | 45 +++++------------------------------------- pandas/core/series.py | 5 ++--- pandas/io/pytables.py | 2 +- 5 files changed, 26 insertions(+), 64 deletions(-) diff --git a/pandas/core/common.py b/pandas/core/common.py index fb17eb8d98c83..d9ab22758e76b 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -651,7 +651,7 @@ def _infer_dtype_from_scalar(val): # a 1-element ndarray if isinstance(val, pa.Array): if val.ndim != 0: - raise ValueError("invalid ndarray passed to _dtype_from_scalar") + raise ValueError("invalid ndarray passed to _infer_dtype_from_scalar") return val.item(), val.dtype @@ -719,13 +719,21 @@ def _maybe_promote(dtype, fill_value=np.nan): def _maybe_upcast(values): - # TODO: convert remaining usage of _maybe_upcast to _maybe_promote - if issubclass(values.dtype.type, np.integer): - values = values.astype(np.float64) - elif issubclass(values.dtype.type, np.bool_): - values = values.astype(np.object_) + """ provide explicty type promotion and coercion """ + new_dtype = _maybe_promote(values.dtype) + if new_dtype != values.dtype: + values = values.astype(new_dtype) return values - + +def _possibly_cast_item(obj, item, dtype): + chunk = obj[item] + + if chunk.values.dtype != dtype: + if dtype in (np.object_, np.bool_): + obj[item] = chunk.astype(np.object_) + elif not issubclass(dtype, (np.integer, np.bool_)): # pragma: no cover + raise ValueError("Unexpected dtype encountered: %s" % dtype) + def _interp_wrapper(f, wrap_dtype, na_override=None): def wrapper(arr, mask, limit=None): @@ -927,16 +935,6 @@ def _possibly_cast_to_datetime(value, dtype, coerce = False): return value -def _possibly_cast_item(obj, item, 
dtype): - chunk = obj[item] - - if chunk.values.dtype != dtype: - if dtype in (np.object_, np.bool_): - obj[item] = chunk.astype(np.object_) - elif not issubclass(dtype, (np.integer, np.bool_)): # pragma: no cover - raise ValueError("Unexpected dtype encountered: %s" % dtype) - - def _is_bool_indexer(key): if isinstance(key, np.ndarray) and key.dtype == np.object_: key = np.asarray(key) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index efb3520b152c0..bee0e0e3094b4 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1234,7 +1234,7 @@ def to_panel(self): panel : Panel """ from pandas.core.panel import Panel - from pandas.core.reshape import block2d_to_block3d + from pandas.core.reshape import block2d_to_blocknd # only support this kind for now if (not isinstance(self.index, MultiIndex) or @@ -1261,8 +1261,8 @@ def to_panel(self): new_blocks = [] for block in selfsorted._data.blocks: - newb = block2d_to_block3d(block.values.T, block.items, shape, - major_labels, minor_labels, + newb = block2d_to_blocknd(block.values.T, block.items, shape, + [ major_labels, minor_labels ], ref_items=selfsorted.columns) new_blocks.append(newb) diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index 362215703e1f2..32f98399bd6dd 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -9,7 +9,8 @@ from pandas.core.frame import DataFrame from pandas.core.categorical import Categorical -from pandas.core.common import notnull, _ensure_platform_int +from pandas.core.common import (notnull, _ensure_platform_int, _maybe_promote, + _maybe_upcast) from pandas.core.groupby import (get_group_index, _compress_group_index, decons_group_index) import pandas.core.common as com @@ -148,11 +149,9 @@ def get_new_values(self): stride = values.shape[1] result_width = width * stride - new_values = np.empty((length, result_width), dtype=values.dtype) - new_mask = np.zeros((length, result_width), dtype=bool) - - new_values = com._maybe_upcast(new_values) + 
new_values = np.empty((length, result_width), dtype=_maybe_promote(values.dtype)) new_values.fill(np.nan) + new_mask = np.zeros((length, result_width), dtype=bool) # is there a simpler / faster way of doing this? for i in xrange(values.shape[1]): @@ -761,40 +760,6 @@ def make_axis_dummies(frame, axis='minor', transform=None): return DataFrame(values, columns=items, index=frame.index) -def block2d_to_block3d(values, items, shape, major_labels, minor_labels, - ref_items=None): - """ - Developer method for pivoting DataFrame -> Panel. Used in HDFStore and - DataFrame.to_panel - """ - from pandas.core.internals import make_block - panel_shape = (len(items),) + shape - - # TODO: lexsort depth needs to be 2!! - - # Create observation selection vector using major and minor - # labels, for converting to panel format. - selector = minor_labels + shape[1] * major_labels - mask = np.zeros(np.prod(shape), dtype=bool) - mask.put(selector, True) - - pvalues = np.empty(panel_shape, dtype=values.dtype) - if not issubclass(pvalues.dtype.type, (np.integer, np.bool_)): - pvalues.fill(np.nan) - elif not mask.all(): - pvalues = com._maybe_upcast(pvalues) - pvalues.fill(np.nan) - - values = values - for i in xrange(len(items)): - pvalues[i].flat[mask] = values[:, i] - - if ref_items is None: - ref_items = items - - return make_block(pvalues, items, ref_items) - - def block2d_to_blocknd(values, items, shape, labels, ref_items=None): """ pivot to the labels shape """ from pandas.core.internals import make_block @@ -812,7 +777,7 @@ def block2d_to_blocknd(values, items, shape, labels, ref_items=None): if not issubclass(pvalues.dtype.type, (np.integer, np.bool_)): pvalues.fill(np.nan) elif not mask.all(): - pvalues = com._maybe_upcast(pvalues) + pvalues = _maybe_upcast(pvalues) pvalues.fill(np.nan) values = values diff --git a/pandas/core/series.py b/pandas/core/series.py index bb7dd934cb383..e8af3963d0f45 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -15,7 +15,7 @@ 
import numpy.ma as ma from pandas.core.common import (isnull, notnull, _is_bool_indexer, - _default_index, _maybe_upcast, + _default_index, _maybe_promote, _asarray_tuplesafe, is_integer_dtype, _infer_dtype_from_scalar) from pandas.core.index import (Index, MultiIndex, InvalidIndexError, @@ -2818,8 +2818,7 @@ def _get_values(): return values if offset is None: - new_values = pa.empty(len(self), dtype=self.dtype) - new_values = _maybe_upcast(new_values) + new_values = pa.empty(len(self), dtype=_maybe_promote(self.dtype)) if periods > 0: new_values[periods:] = self.values[:-periods] diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 84c2ef4957529..b56b6c5e5923f 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -23,7 +23,7 @@ from pandas.core.categorical import Categorical from pandas.core.common import _asarray_tuplesafe, _try_sort from pandas.core.internals import BlockManager, make_block, form_blocks -from pandas.core.reshape import block2d_to_block3d, block2d_to_blocknd, factor_indexer +from pandas.core.reshape import block2d_to_blocknd, factor_indexer from pandas.core.index import Int64Index import pandas.core.common as com from pandas.tools.merge import concat From 2ce3b56d32bedcd7bf268edf624f2a6f9e448fc1 Mon Sep 17 00:00:00 2001 From: jreback Date: Thu, 14 Feb 2013 07:33:51 -0500 Subject: [PATCH 8/9] TST: force rebuild --- pandas/core/common.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/common.py b/pandas/core/common.py index d9ab22758e76b..19f69e3f89733 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -717,7 +717,6 @@ def _maybe_promote(dtype, fill_value=np.nan): return dtype return np.object_ - def _maybe_upcast(values): """ provide explicty type promotion and coercion """ new_dtype = _maybe_promote(values.dtype) From cb56c98de37c4d21f8f9f5bbb5b674b569d81bec Mon Sep 17 00:00:00 2001 From: jreback Date: Thu, 14 Feb 2013 14:36:23 -0500 Subject: [PATCH 9/9] CLN: change call signature of 
_maybe_promote (from stephenwlin branch) and _infer_dtype_from_scalar to match (both return dtype, fill_value) Diff between 'jreback/dtypes_bug' and 'stephenwlin/dtypes_bug' Conflicts: pandas/core/common.py --- pandas/core/common.py | 82 +++++++++++++++++++++++--------------- pandas/core/frame.py | 21 +++------- pandas/core/internals.py | 8 ++-- pandas/core/panel.py | 4 +- pandas/core/reshape.py | 17 ++++---- pandas/core/series.py | 9 +++-- pandas/tests/test_frame.py | 4 +- 7 files changed, 78 insertions(+), 67 deletions(-) diff --git a/pandas/core/common.py b/pandas/core/common.py index 19f69e3f89733..98a92072fe608 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -504,7 +504,7 @@ def take_1d(arr, indexer, out=None, fill_value=np.nan): dtype, fill_value = arr.dtype, arr.dtype.type() else: indexer = _ensure_int64(indexer) - dtype = _maybe_promote(arr.dtype, fill_value) + dtype = _maybe_promote(arr.dtype, fill_value)[0] if dtype != arr.dtype: mask = indexer == -1 needs_masking = mask.any() @@ -552,7 +552,7 @@ def take_2d_multi(arr, row_idx, col_idx, fill_value=np.nan, out=None): else: col_idx = _ensure_int64(col_idx) - dtype = _maybe_promote(arr.dtype, fill_value) + dtype = _maybe_promote(arr.dtype, fill_value)[0] if dtype != arr.dtype: row_mask = row_idx == -1 col_mask = col_idx == -1 @@ -588,7 +588,7 @@ def diff(arr, n, axis=0): n = int(n) dtype = arr.dtype if issubclass(dtype.type, np.integer): - dtype = np.float_ + dtype = np.float64 elif issubclass(dtype.type, np.bool_): dtype = np.object_ @@ -629,7 +629,7 @@ def take_fast(arr, indexer, mask, needs_masking, axis=0, out=None, else: indexer = _ensure_int64(indexer) if needs_masking: - dtype = _maybe_promote(arr.dtype, fill_value) + dtype = _maybe_promote(arr.dtype, fill_value)[0] if dtype != arr.dtype and out is not None and out.dtype != dtype: raise Exception('Incompatible type for fill_value') else: @@ -644,16 +644,20 @@ def take_fast(arr, indexer, mask, needs_masking, axis=0, out=None, 
take_f(arr, indexer, out=out, fill_value=fill_value) return out + def _infer_dtype_from_scalar(val): """ interpret the dtype from a scalar, upcast floats and ints return the new value and the dtype """ + dtype = np.object_ + # a 1-element ndarray if isinstance(val, pa.Array): if val.ndim != 0: raise ValueError("invalid ndarray passed to _infer_dtype_from_scalar") - return val.item(), val.dtype + dtype = val.dtype + val = val.item() elif isinstance(val, basestring): @@ -662,67 +666,79 @@ def _infer_dtype_from_scalar(val): # so this is kind of bad. Alternately we could use np.repeat # instead of np.empty (but then you still don't want things # coming out as np.str_! - return val, np.object_ + + dtype = np.object_ elif isinstance(val, np.datetime64): # ugly hacklet - val = lib.Timestamp(val).value - return val, np.dtype('M8[ns]') + val = lib.Timestamp(val).value + dtype = np.dtype('M8[ns]') elif is_bool(val): - return val, np.bool_ + dtype = np.bool_ # provide implicity upcast on scalars elif is_integer(val): - return val, np.int64 + dtype = np.int64 + elif is_float(val): - return val, np.float64 + dtype = np.float64 elif is_complex(val): - return val, np.complex_ + dtype = np.complex_ - return val, np.object_ + return dtype, val def _maybe_promote(dtype, fill_value=np.nan): + # returns tuple of (dtype, fill_value) if issubclass(dtype.type, np.datetime64): - # for now: refuse to upcast + # for now: refuse to upcast datetime64 # (this is because datetime64 will not implicitly upconvert # to object correctly as of numpy 1.6.1) - return dtype + if isnull(fill_value): + fill_value = tslib.iNaT + else: + try: + fill_value = lib.Timestamp(fill_value).value + except: + # the proper thing to do here would probably be to upcast to + # object (but numpy 1.6.1 doesn't do this properly) + fill_value = tslib.iNaT elif is_float(fill_value): if issubclass(dtype.type, np.bool_): - return np.object_ + dtype = np.object_ elif issubclass(dtype.type, np.integer): - return np.float64 - 
return dtype + dtype = np.float64 elif is_bool(fill_value): - if issubclass(dtype.type, np.bool_): - return dtype - return np.object_ + if not issubclass(dtype.type, np.bool_): + dtype = np.object_ elif is_integer(fill_value): if issubclass(dtype.type, np.bool_): - return np.object_ + dtype = np.object_ elif issubclass(dtype.type, np.integer): # upcast to prevent overflow arr = np.asarray(fill_value) if arr != arr.astype(dtype): - return arr.dtype - return dtype - return dtype + dtype = arr.dtype elif is_complex(fill_value): if issubclass(dtype.type, np.bool_): - return np.object_ + dtype = np.object_ elif issubclass(dtype.type, (np.integer, np.floating)): - return np.complex_ - return dtype - return np.object_ + dtype = np.complex128 + else: + dtype = np.object_ + return dtype, fill_value -def _maybe_upcast(values): - """ provide explicty type promotion and coercion """ - new_dtype = _maybe_promote(values.dtype) +def _maybe_upcast(values, fill_value=np.nan, copy=False): + """ provide explicty type promotion and coercion + if copy == True, then a copy is created even if no upcast is required """ + + new_dtype, fill_value = _maybe_promote(values.dtype, fill_value) if new_dtype != values.dtype: values = values.astype(new_dtype) - return values + elif copy: + values = values.copy() + return values, fill_value def _possibly_cast_item(obj, item, dtype): chunk = obj[item] diff --git a/pandas/core/frame.py b/pandas/core/frame.py index bee0e0e3094b4..ecd7d57a0e4d2 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -390,12 +390,8 @@ def __init__(self, data=None, index=None, columns=None, dtype=None, mgr = self._init_dict(data, index, columns, dtype=dtype) elif isinstance(data, ma.MaskedArray): mask = ma.getmaskarray(data) - datacopy = ma.copy(data) - if issubclass(data.dtype.type, np.datetime64): - datacopy[mask] = tslib.iNaT - else: - datacopy = com._maybe_upcast(datacopy) - datacopy[mask] = NA + datacopy, fill_value = com._maybe_upcast(data, copy=True) + 
datacopy[mask] = fill_value mgr = self._init_ndarray(datacopy, index, columns, dtype=dtype, copy=copy) elif isinstance(data, np.ndarray): @@ -437,7 +433,7 @@ def __init__(self, data=None, index=None, columns=None, dtype=None, if isinstance(data, basestring) and dtype is None: dtype = np.object_ if dtype is None: - data, dtype = _infer_dtype_from_scalar(data) + dtype, data = _infer_dtype_from_scalar(data) values = np.empty((len(index), len(columns)), dtype=dtype) values.fill(data) @@ -1878,7 +1874,7 @@ def set_value(self, index, col, value): new_index, new_columns = self._expand_axes((index, col)) result = self.reindex(index=new_index, columns=new_columns, copy=False) - value, likely_dtype = _infer_dtype_from_scalar(value) + likely_dtype, value = _infer_dtype_from_scalar(value) made_bigger = not np.array_equal(new_columns, self.columns) @@ -2208,7 +2204,7 @@ def _sanitize_column(self, key, value): existing_piece = self[key] # upcast the scalar - value, dtype = _infer_dtype_from_scalar(value) + dtype, value = _infer_dtype_from_scalar(value) # transpose hack if isinstance(existing_piece, DataFrame): @@ -2217,16 +2213,11 @@ def _sanitize_column(self, key, value): else: value = np.repeat(value, len(self.index)) - # special case for now (promotion) - if (com.is_float_dtype(existing_piece) and - com.is_integer_dtype(value)): - dtype = np.float64 - value = value.astype(dtype) else: # upcast the scalar - value, dtype = _infer_dtype_from_scalar(value) + dtype, value = _infer_dtype_from_scalar(value) value = np.array(np.repeat(value, len(self.index)), dtype=dtype) value = com._possibly_cast_to_datetime(value, dtype) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index ddcf271dc0687..56802c2cb3bae 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -377,11 +377,11 @@ def shift(self, indexer, periods): new_values = self.values.take(indexer, axis=1) # convert integer to float if necessary. 
need to do a lot more than # that, handle boolean etc also - new_values = com._maybe_upcast(new_values) + new_values, fill_value = com._maybe_upcast(new_values) if periods > 0: - new_values[:, :periods] = np.nan + new_values[:, :periods] = fill_value else: - new_values[:, periods:] = np.nan + new_values[:, periods:] = fill_value return make_block(new_values, self.items, self.ref_items) def where(self, func, other, cond = None, raise_on_error = True, try_cast = False): @@ -1412,7 +1412,7 @@ def _make_na_block(self, items, ref_items, fill_value=np.nan): block_shape = list(self.shape) block_shape[0] = len(items) - fill_value, dtype = com._infer_dtype_from_scalar(fill_value) + dtype, fill_value = com._infer_dtype_from_scalar(fill_value) block_values = np.empty(block_shape, dtype=dtype) block_values.fill(fill_value) na_block = make_block(block_values, items, ref_items) diff --git a/pandas/core/panel.py b/pandas/core/panel.py index 6fea1cc85c728..6e52193a2c025 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -658,7 +658,7 @@ def set_value(self, *args): d = self._construct_axes_dict_from(self, axes, copy=False) result = self.reindex(**d) args = list(args) - args[-1], likely_dtype = _infer_dtype_from_scalar(args[-1]) + likely_dtype, args[-1] = _infer_dtype_from_scalar(args[-1]) made_bigger = not np.array_equal( axes[0], getattr(self, self._info_axis)) # how to make this logic simpler? 
@@ -693,7 +693,7 @@ def __setitem__(self, key, value): assert(value.shape == shape[1:]) mat = np.asarray(value) elif np.isscalar(value): - value, dtype = _infer_dtype_from_scalar(value) + dtype, value = _infer_dtype_from_scalar(value) mat = np.empty(shape[1:], dtype=dtype) mat.fill(value) else: diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index 32f98399bd6dd..c86273b8a1cca 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -149,8 +149,9 @@ def get_new_values(self): stride = values.shape[1] result_width = width * stride - new_values = np.empty((length, result_width), dtype=_maybe_promote(values.dtype)) - new_values.fill(np.nan) + dtype, fill_value = _maybe_promote(values.dtype) + new_values = np.empty((length, result_width), dtype=dtype) + new_values.fill(fill_value) new_mask = np.zeros((length, result_width), dtype=bool) # is there a simpler / faster way of doing this? @@ -773,12 +774,12 @@ def block2d_to_blocknd(values, items, shape, labels, ref_items=None): mask = np.zeros(np.prod(shape), dtype=bool) mask.put(selector, True) - pvalues = np.empty(panel_shape, dtype=values.dtype) - if not issubclass(pvalues.dtype.type, (np.integer, np.bool_)): - pvalues.fill(np.nan) - elif not mask.all(): - pvalues = _maybe_upcast(pvalues) - pvalues.fill(np.nan) + if mask.all(): + pvalues = np.empty(panel_shape, dtype=values.dtype) + else: + dtype, fill_value = _maybe_promote(values.dtype) + pvalues = np.empty(panel_shape, dtype=dtype) + pvalues.fill(fill_value) values = values for i in xrange(len(items)): diff --git a/pandas/core/series.py b/pandas/core/series.py index e8af3963d0f45..21109593489ad 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2818,14 +2818,15 @@ def _get_values(): return values if offset is None: - new_values = pa.empty(len(self), dtype=_maybe_promote(self.dtype)) + dtype, fill_value = _maybe_promote(self.dtype) + new_values = pa.empty(len(self), dtype=dtype) if periods > 0: new_values[periods:] = 
self.values[:-periods] - new_values[:periods] = nan + new_values[:periods] = fill_value elif periods < 0: new_values[:periods] = self.values[-periods:] - new_values[periods:] = nan + new_values[periods:] = fill_value return Series(new_values, index=self.index, name=self.name) elif isinstance(self.index, PeriodIndex): @@ -3129,7 +3130,7 @@ def _try_cast(arr): # figure out the dtype from the value (upcast if necessary) if dtype is None: - value, dtype = _infer_dtype_from_scalar(value) + dtype, value = _infer_dtype_from_scalar(value) else: # need to possibly convert the value here value = com._possibly_cast_to_datetime(value, dtype) diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 24883b3359c42..fd4186ed39902 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -441,8 +441,10 @@ def test_setitem_cast(self): self.assert_(self.frame['D'].dtype == np.int64) # #669, should not cast? + # this is now set to int64, which means a replacement of the column to + # the value dtype (and nothing to do with the existing dtype) self.frame['B'] = 0 - self.assert_(self.frame['B'].dtype == np.float64) + self.assert_(self.frame['B'].dtype == np.int64) # cast if pass array of course self.frame['B'] = np.arange(len(self.frame))