diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index a56212328f5c3..ae0f2181a9850 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -792,6 +792,7 @@ Performance Improvements Bug Fixes ~~~~~~~~~ +- Bug in ``DataFrame.values`` now returns object dtyped numpy array of ``Timestamp`` for tz-aware columns; previously this returned ``DatetimeIndex`` (:issue:`14052`) - Bug in ``Timestamp.replace`` now raises ``TypeError`` when incorrect argument names are given; previously this raised ``ValueError`` (:issue:`15240`) - Bug in ``Index`` power operations with reversed operands (:issue:`14973`) - Bug in ``TimedeltaIndex`` addition where overflow was being allowed without error (:issue:`14816`) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 1db9677659ca3..d1de9d0bfa01a 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3076,7 +3076,9 @@ def values(self): e.g. If the dtypes are float16 and float32, dtype will be upcast to float32. If dtypes are int32 and uint8, dtype will be upcast to int32. By numpy.find_common_type convention, mixing int64 and uint64 - will result in a flot64 dtype. + will result in a float64 dtype. + + Unlike ``Series.values``, tz-aware dtypes will be upcast to object. 
""" return self.as_matrix() @@ -5098,6 +5100,7 @@ def where(self, cond, other=np.nan, inplace=False, axis=None, level=None, try_cast=False, raise_on_error=True): other = com._apply_if_callable(other, self) + return self._where(cond, other, inplace, axis, level, try_cast, raise_on_error) @@ -5783,7 +5786,7 @@ def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None, **kwargs)) - 1) if freq is None: mask = isnull(_values_from_object(self)) - np.putmask(rs.values, mask, np.nan) + rs.iloc[mask] = np.nan return rs def _agg_by_level(self, name, axis=0, level=0, skipna=True, **kwargs): diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 0e6c176d950a1..cfa991eefef16 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -2388,9 +2388,15 @@ def get_values(self, dtype=None): # return object dtype as Timestamps with the zones if is_object_dtype(dtype): f = lambda x: lib.Timestamp(x, tz=self.values.tz) - return lib.map_infer( + values = lib.map_infer( self.values.ravel(), f).reshape(self.values.shape) - return self.values + + if values.ndim == self.ndim - 1: + values = values.reshape((1,) + values.shape) + else: + return self.values + + return values def to_object_block(self, mgr): """ @@ -3424,10 +3430,7 @@ def as_matrix(self, items=None): else: mgr = self - if self._is_single_block or not self.is_mixed_type: - return mgr.blocks[0].get_values() - else: - return mgr._interleave() + return mgr._interleave() def _interleave(self): """ @@ -3436,6 +3439,10 @@ def _interleave(self): """ dtype = _interleaved_dtype(self.blocks) + if self._is_single_block or not self.is_mixed_type: + return np.array(self.blocks[0].get_values(dtype), + dtype=dtype, copy=False) + result = np.empty(self.shape, dtype=dtype) if result.shape[0] == 0: @@ -4485,33 +4492,64 @@ def _interleaved_dtype(blocks): for x in blocks: counts[type(x)].append(x) - have_int = len(counts[IntBlock]) > 0 have_bool = len(counts[BoolBlock]) > 0 have_object = 
len(counts[ObjectBlock]) > 0 + have_int = len(counts[IntBlock]) > 0 have_float = len(counts[FloatBlock]) > 0 have_complex = len(counts[ComplexBlock]) > 0 have_dt64 = len(counts[DatetimeBlock]) > 0 have_dt64_tz = len(counts[DatetimeTZBlock]) > 0 have_td64 = len(counts[TimeDeltaBlock]) > 0 - have_cat = len(counts[CategoricalBlock]) > 0 + have_cat = len(counts[CategoricalBlock]) # TODO: have_sparse is not used have_sparse = len(counts[SparseBlock]) > 0 # noqa - have_numeric = have_float or have_complex or have_int - has_non_numeric = have_dt64 or have_dt64_tz or have_td64 or have_cat + have_numeric = have_float + have_complex + have_int + have_dt = have_dt64 + have_dt64_tz + have_non_numeric = have_dt64 + have_dt64_tz + have_td64 + have_cat + have_non_dt = have_td64 + have_cat + have_mixed = bool(have_numeric) + bool(have_non_dt) + bool(have_dt) if (have_object or - (have_bool and - (have_numeric or have_dt64 or have_dt64_tz or have_td64)) or - (have_numeric and has_non_numeric) or have_cat or have_dt64 or - have_dt64_tz or have_td64): + (have_non_numeric > 1) or # more than one type of non numeric + (have_bool and have_mixed) or # mix of a numeric et non numeric + (have_mixed > 1) or # mix of a numeric et non numeric + have_dt64_tz or + (have_cat > 1)): return np.dtype(object) + elif have_dt64: + return np.dtype("datetime64[ns]") + elif have_td64: + return np.dtype("timedelta64[ns]") elif have_bool: - return np.dtype(bool) + return np.dtype("bool") + elif have_cat: + # return blocks[0].get_values().dtype + # if we are mixing unsigned and signed, then return + # the next biggest int type (if we can) + + dts = [b.get_values().dtype for b in counts[CategoricalBlock]] + lcd = _find_common_type(dts) + kinds = set([_dt.kind for _dt in dts]) + + if len(kinds) == 1: + return lcd + + if lcd == 'uint64' or lcd == 'int64': + return np.dtype('int64') + + # return 1 bigger on the itemsize if unsinged + if lcd.kind == 'u': + return np.dtype('int%s' % (lcd.itemsize * 8 * 2)) + 
return lcd + elif have_int and not have_float and not have_complex: # if we are mixing unsigned and signed, then return # the next biggest int type (if we can) - lcd = _find_common_type([b.dtype for b in counts[IntBlock]]) - kinds = set([i.dtype.kind for i in counts[IntBlock]]) + + dts = [b.dtype for b in counts[IntBlock]] + lcd = _find_common_type(dts) + kinds = set([_dt.kind for _dt in dts]) + if len(kinds) == 1: return lcd diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py index f7d2c1a654cd5..3d399e7b848bf 100644 --- a/pandas/tests/frame/test_dtypes.py +++ b/pandas/tests/frame/test_dtypes.py @@ -1,23 +1,25 @@ # -*- coding: utf-8 -*- from __future__ import print_function + +import itertools from datetime import timedelta import numpy as np + +import pandas as pd +import pandas.util.testing as tm from pandas import (DataFrame, Series, date_range, Timedelta, Timestamp, compat, concat, option_context) from pandas.compat import u -from pandas.types.dtypes import DatetimeTZDtype from pandas.tests.frame.common import TestData +from pandas.types.dtypes import DatetimeTZDtype from pandas.util.testing import (assert_series_equal, assert_frame_equal, makeCustomDataframe as mkdf) -import pandas.util.testing as tm -import pandas as pd class TestDataFrameDataTypes(tm.TestCase, TestData): - def test_concat_empty_dataframe_dtypes(self): df = DataFrame(columns=list("abc")) df['a'] = df['a'].astype(np.bool_) @@ -198,7 +200,7 @@ def test_select_dtypes_not_an_attr_but_still_valid_dtype(self): def test_select_dtypes_empty(self): df = DataFrame({'a': list('abc'), 'b': list(range(1, 4))}) with tm.assertRaisesRegexp(ValueError, 'at least one of include or ' - 'exclude must be nonempty'): + 'exclude must be nonempty'): df.select_dtypes() def test_select_dtypes_raises_on_string(self): @@ -536,7 +538,6 @@ def test_arg_for_errors_in_astype(self): class TestDataFrameDatetimeWithTZ(tm.TestCase, TestData): - def test_interleave(self): # interleave with object 
@@ -622,3 +623,77 @@ def test_astype_str(self): 'NaT NaT' in result) self.assertTrue('2 2013-01-03 2013-01-03 00:00:00-05:00 ' '2013-01-03 00:00:00+01:00' in result) + + def test_values_is_ndarray_with_datetime64tz(self): + df = DataFrame({ + 'A': date_range('20130101', periods=3), + 'B': date_range('20130101', periods=3, tz='US/Eastern'), + }) + + for col in [ + ["A"], + ["A", "A"], + ["A", "B"], + ["B", "B"], + ["B"], + ]: + arr = df[col].values + dtype_expected = "