diff --git a/RELEASE.rst b/RELEASE.rst index e98849123c46c..25350555317bd 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -53,6 +53,9 @@ pandas 0.11.0 - Do not automatically upcast numeric specified dtypes to ``int64`` or ``float64`` (GH622_ and GH797_) + - DataFrame construction of lists and scalars, with no dtype present, will + result in casting to ``int64`` or ``float64``, regardless of platform. + This is not an apparent change in the API, but noting it. - Guarantee that ``convert_objects()`` for Series/DataFrame always returns a copy - groupby operations will respect dtypes for numeric float operations diff --git a/doc/source/v0.11.0.txt b/doc/source/v0.11.0.txt index d2648cbdb5a44..0162ee85ac518 100644 --- a/doc/source/v0.11.0.txt +++ b/doc/source/v0.11.0.txt @@ -3,7 +3,7 @@ v0.11.0 (March ??, 2013) ------------------------ -This is a minor release from 0.10.1 and includes many new features and +This is a major release from 0.10.1 and includes many new features and enhancements along with a large number of bug fixes. There are also a number of important API changes that long-time pandas users should pay close attention to. @@ -13,7 +13,8 @@ API changes Numeric dtypes will propagate and can coexist in DataFrames. If a dtype is passed (either directly via the ``dtype`` keyword, a passed ``ndarray``, or a passed ``Series``, then it will be preserved in DataFrame operations. Furthermore, different numeric dtypes will **NOT** be combined. The following example will give you a taste. -**Dtype Specification** +Dtype Specification +~~~~~~~~~~~~~~~~~~~ .. ipython:: python @@ -29,7 +30,8 @@ Numeric dtypes will propagate and can coexist in DataFrames. If a dtype is passe df3 df3.dtypes -**Dtype conversion** +Dtype Conversion +~~~~~~~~~~~~~~~~ .. ipython:: python @@ -54,6 +56,26 @@ Numeric dtypes will propagate and can coexist in DataFrames. 
If a dtype is passe Timestamp('20010104'), '20010105'],dtype='O') s.convert_objects(convert_dates='coerce') +Dtype Gotchas +~~~~~~~~~~~~~ + +**Platform Gotchas** + +Starting in 0.11.0, construction of DataFrame/Series will use default dtypes of ``int64`` and ``float64``, +*regardless of platform*. This is not an apparent change from earlier versions of pandas. If you specify +dtypes, they *WILL* be respected, however (GH2837_) + +The following will all result in ``int64`` dtypes + +.. ipython:: python + + DataFrame([1,2],columns=['a']).dtypes + DataFrame({'a' : [1,2] }).dtypes + DataFrame({'a' : 1 }, index=range(2)).dtypes + +Keep in mind that ``DataFrame(np.array([1,2]))`` **WILL** result in ``int32`` on 32-bit platforms! + + **Upcasting Gotchas** Performing indexing operations on integer type data can easily upcast the data. @@ -82,21 +104,13 @@ While float dtypes are unchanged. casted casted.dtypes -New features -~~~~~~~~~~~~ - -**Enhancements** +Datetimes Conversion +~~~~~~~~~~~~~~~~~~~~ - - In ``HDFStore``, provide dotted attribute access to ``get`` from stores (e.g. store.df == store['df']) - -**Bug Fixes** - -See the `full release notes -`__ or issue tracker -on GitHub for a complete list. - - -Datetime64[ns] columns in a DataFrame (or a Series) allow the use of ``np.nan`` to indicate a nan value, in addition to the traditional ``NaT``, or not-a-time. This allows convenient nan setting in a generic way. Furthermore datetime64 columns are created by default, when passed datetimelike objects (*this change was introduced in 0.10.1*) +Datetime64[ns] columns in a DataFrame (or a Series) allow the use of ``np.nan`` to indicate a nan value, +in addition to the traditional ``NaT``, or not-a-time. This allows convenient nan setting in a generic way. +Furthermore ``datetime64[ns]`` columns are created by default, when passed datetimelike objects (*this change was introduced in 0.10.1*) +(GH2809_, GH2810_) .. 
ipython:: python @@ -111,8 +125,7 @@ Datetime64[ns] columns in a DataFrame (or a Series) allow the use of ``np.nan`` df.ix[2:4,['A','timestamp']] = np.nan df -Astype conversion on datetime64[ns] to object, implicity converts ``NaT`` to ``np.nan`` - +Astype conversion on ``datetime64[ns]`` to ``object``, implicitly converts ``NaT`` to ``np.nan`` .. ipython:: python @@ -127,13 +140,32 @@ Astype conversion on datetime64[ns] to object, implicity converts ``NaT`` to ``n s.dtype -``Squeeze`` to possibly remove length 1 dimensions from an object. +New features +~~~~~~~~~~~~ -.. ipython:: python +**Enhancements** + + - In ``HDFStore``, provide dotted attribute access to ``get`` from stores + (e.g. store.df == store['df']) + + - ``Squeeze`` to possibly remove length 1 dimensions from an object. - p = Panel(randn(3,4,4),items=['ItemA','ItemB','ItemC'], + .. ipython:: python + + p = Panel(randn(3,4,4),items=['ItemA','ItemB','ItemC'], major_axis=date_range('20010102',periods=4), minor_axis=['A','B','C','D']) - p - p.reindex(items=['ItemA']).squeeze() - p.reindex(items=['ItemA'],minor=['B']).squeeze() + p + p.reindex(items=['ItemA']).squeeze() + p.reindex(items=['ItemA'],minor=['B']).squeeze() + +**Bug Fixes** + +See the `full release notes +<https://github.com/pydata/pandas/blob/master/RELEASE.rst>`__ or issue tracker +on GitHub for a complete list. + +.. _GH2809: https://github.com/pydata/pandas/issues/2809 +.. _GH2810: https://github.com/pydata/pandas/issues/2810 +.. 
_GH2837: https://github.com/pydata/pandas/issues/2837 + diff --git a/pandas/core/common.py b/pandas/core/common.py index 7535ed68722fb..98a92072fe608 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -24,6 +24,7 @@ from pandas.util.py3compat import StringIO, BytesIO from pandas.core.config import get_option +from pandas.core import array as pa # XXX: HACK for NumPy 1.5.1 to suppress warnings try: @@ -503,7 +504,7 @@ def take_1d(arr, indexer, out=None, fill_value=np.nan): dtype, fill_value = arr.dtype, arr.dtype.type() else: indexer = _ensure_int64(indexer) - dtype = _maybe_promote(arr.dtype, fill_value) + dtype = _maybe_promote(arr.dtype, fill_value)[0] if dtype != arr.dtype: mask = indexer == -1 needs_masking = mask.any() @@ -551,7 +552,7 @@ def take_2d_multi(arr, row_idx, col_idx, fill_value=np.nan, out=None): else: col_idx = _ensure_int64(col_idx) - dtype = _maybe_promote(arr.dtype, fill_value) + dtype = _maybe_promote(arr.dtype, fill_value)[0] if dtype != arr.dtype: row_mask = row_idx == -1 col_mask = col_idx == -1 @@ -587,7 +588,7 @@ def diff(arr, n, axis=0): n = int(n) dtype = arr.dtype if issubclass(dtype.type, np.integer): - dtype = np.float_ + dtype = np.float64 elif issubclass(dtype.type, np.bool_): dtype = np.object_ @@ -628,7 +629,7 @@ def take_fast(arr, indexer, mask, needs_masking, axis=0, out=None, else: indexer = _ensure_int64(indexer) if needs_masking: - dtype = _maybe_promote(arr.dtype, fill_value) + dtype = _maybe_promote(arr.dtype, fill_value)[0] if dtype != arr.dtype and out is not None and out.dtype != dtype: raise Exception('Incompatible type for fill_value') else: @@ -644,49 +645,110 @@ def take_fast(arr, indexer, mask, needs_masking, axis=0, out=None, return out +def _infer_dtype_from_scalar(val): + """ interpret the dtype from a scalar, upcast floats and ints + return the new value and the dtype """ + + dtype = np.object_ + + # a 1-element ndarray + if isinstance(val, pa.Array): + if val.ndim != 0: + raise 
ValueError("invalid ndarray passed to _infer_dtype_from_scalar") + + dtype = val.dtype + val = val.item() + + elif isinstance(val, basestring): + + # If we create an empty array using a string to infer + # the dtype, NumPy will only allocate one character per entry + # so this is kind of bad. Alternately we could use np.repeat + # instead of np.empty (but then you still don't want things + # coming out as np.str_!) + + dtype = np.object_ + + elif isinstance(val, np.datetime64): + # ugly hacklet + val = lib.Timestamp(val).value + dtype = np.dtype('M8[ns]') + + elif is_bool(val): + dtype = np.bool_ + + # provide implicit upcast on scalars + elif is_integer(val): + dtype = np.int64 + + elif is_float(val): + dtype = np.float64 + + elif is_complex(val): + dtype = np.complex_ + + return dtype, val + def _maybe_promote(dtype, fill_value=np.nan): + # returns tuple of (dtype, fill_value) if issubclass(dtype.type, np.datetime64): - # for now: refuse to upcast + # for now: refuse to upcast datetime64 # (this is because datetime64 will not implicitly upconvert # to object correctly as of numpy 1.6.1) - return dtype + if isnull(fill_value): + fill_value = tslib.iNaT + else: + try: + fill_value = lib.Timestamp(fill_value).value + except: + # the proper thing to do here would probably be to upcast to + # object (but numpy 1.6.1 doesn't do this properly) + fill_value = tslib.iNaT elif is_float(fill_value): if issubclass(dtype.type, np.bool_): - return np.object_ + dtype = np.object_ elif issubclass(dtype.type, np.integer): - return np.float_ - return dtype + dtype = np.float64 elif is_bool(fill_value): - if issubclass(dtype.type, np.bool_): - return dtype - return np.object_ + if not issubclass(dtype.type, np.bool_): + dtype = np.object_ elif is_integer(fill_value): if issubclass(dtype.type, np.bool_): - return np.object_ + dtype = np.object_ elif issubclass(dtype.type, np.integer): # upcast to prevent overflow arr = np.asarray(fill_value) if arr != arr.astype(dtype): - return 
arr.dtype - return dtype - return dtype + dtype = arr.dtype elif is_complex(fill_value): if issubclass(dtype.type, np.bool_): - return np.object_ + dtype = np.object_ elif issubclass(dtype.type, (np.integer, np.floating)): - return np.complex_ - return dtype - return np.object_ + dtype = np.complex128 + else: + dtype = np.object_ + return dtype, fill_value +def _maybe_upcast(values, fill_value=np.nan, copy=False): + """ provide explicty type promotion and coercion + if copy == True, then a copy is created even if no upcast is required """ + + new_dtype, fill_value = _maybe_promote(values.dtype, fill_value) + if new_dtype != values.dtype: + values = values.astype(new_dtype) + elif copy: + values = values.copy() + return values, fill_value + +def _possibly_cast_item(obj, item, dtype): + chunk = obj[item] + + if chunk.values.dtype != dtype: + if dtype in (np.object_, np.bool_): + obj[item] = chunk.astype(np.object_) + elif not issubclass(dtype, (np.integer, np.bool_)): # pragma: no cover + raise ValueError("Unexpected dtype encountered: %s" % dtype) -def _maybe_upcast(values): - # TODO: convert remaining usage of _maybe_upcast to _maybe_promote - if issubclass(values.dtype.type, np.integer): - values = values.astype(np.float_) - elif issubclass(values.dtype.type, np.bool_): - values = values.astype(np.object_) - return values - def _interp_wrapper(f, wrap_dtype, na_override=None): def wrapper(arr, mask, limit=None): @@ -808,7 +870,8 @@ def _consensus_name_attr(objs): def _possibly_convert_objects(values, convert_dates=True, convert_numeric=True): """ if we have an object dtype, try to coerce dates and/or numers """ - if values.dtype == np.object_ and convert_dates: + # convert dates + if convert_dates and values.dtype == np.object_: # we take an aggressive stance and convert to datetime64[ns] if convert_dates == 'coerce': @@ -821,7 +884,8 @@ def _possibly_convert_objects(values, convert_dates=True, convert_numeric=True): else: values = 
lib.maybe_convert_objects(values, convert_datetime=convert_dates) - if values.dtype == np.object_ and convert_numeric: + # convert to numeric + if convert_numeric and values.dtype == np.object_: try: new_values = lib.maybe_convert_numeric(values,set(),coerce_numeric=True) @@ -834,6 +898,16 @@ def _possibly_convert_objects(values, convert_dates=True, convert_numeric=True): return values +def _possibly_convert_platform(values): + """ try to do platform conversion, allow ndarray or list here """ + + if isinstance(values, (list,tuple)): + values = lib.list_to_object_array(values) + if values.dtype == np.object_: + values = lib.maybe_convert_objects(values) + + return values + def _possibly_cast_to_datetime(value, dtype, coerce = False): """ try to cast the array/value to a datetimelike dtype, converting float nan to iNaT """ @@ -876,29 +950,6 @@ def _possibly_cast_to_datetime(value, dtype, coerce = False): return value -def _infer_dtype(value): - if isinstance(value, (float, np.floating)): - return np.float_ - elif isinstance(value, (bool, np.bool_)): - return np.bool_ - elif isinstance(value, (int, long, np.integer)): - return np.int_ - elif isinstance(value, (complex, np.complexfloating)): - return np.complex_ - else: - return np.object_ - - -def _possibly_cast_item(obj, item, dtype): - chunk = obj[item] - - if chunk.values.dtype != dtype: - if dtype in (np.object_, np.bool_): - obj[item] = chunk.astype(np.object_) - elif not issubclass(dtype, (np.integer, np.bool_)): # pragma: no cover - raise ValueError("Unexpected dtype encountered: %s" % dtype) - - def _is_bool_indexer(key): if isinstance(key, np.ndarray) and key.dtype == np.object_: key = np.asarray(key) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ecf2f8ba482f6..ecd7d57a0e4d2 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -23,13 +23,13 @@ import numpy.ma as ma from pandas.core.common import (isnull, notnull, PandasError, _try_sort, - _default_index, _is_sequence) + 
_default_index, _is_sequence, _infer_dtype_from_scalar) from pandas.core.generic import NDFrame from pandas.core.index import Index, MultiIndex, _ensure_index from pandas.core.indexing import (_NDFrameIndexer, _maybe_droplevels, _is_index_slice, _check_bool_indexer) from pandas.core.internals import BlockManager, make_block, form_blocks -from pandas.core.series import Series, _radd_compat, _dtype_from_scalar +from pandas.core.series import Series, _radd_compat from pandas.compat.scipy import scoreatpercentile as _quantile from pandas.util.compat import OrderedDict from pandas.util import py3compat @@ -390,12 +390,8 @@ def __init__(self, data=None, index=None, columns=None, dtype=None, mgr = self._init_dict(data, index, columns, dtype=dtype) elif isinstance(data, ma.MaskedArray): mask = ma.getmaskarray(data) - datacopy = ma.copy(data) - if issubclass(data.dtype.type, np.datetime64): - datacopy[mask] = tslib.iNaT - else: - datacopy = com._maybe_upcast(datacopy) - datacopy[mask] = NA + datacopy, fill_value = com._maybe_upcast(data, copy=True) + datacopy[mask] = fill_value mgr = self._init_ndarray(datacopy, index, columns, dtype=dtype, copy=copy) elif isinstance(data, np.ndarray): @@ -437,7 +433,7 @@ def __init__(self, data=None, index=None, columns=None, dtype=None, if isinstance(data, basestring) and dtype is None: dtype = np.object_ if dtype is None: - data, dtype = _dtype_from_scalar(data) + dtype, data = _infer_dtype_from_scalar(data) values = np.empty((len(index), len(columns)), dtype=dtype) values.fill(data) @@ -1234,7 +1230,7 @@ def to_panel(self): panel : Panel """ from pandas.core.panel import Panel - from pandas.core.reshape import block2d_to_block3d + from pandas.core.reshape import block2d_to_blocknd # only support this kind for now if (not isinstance(self.index, MultiIndex) or @@ -1261,8 +1257,8 @@ def to_panel(self): new_blocks = [] for block in selfsorted._data.blocks: - newb = block2d_to_block3d(block.values.T, block.items, shape, - major_labels, 
minor_labels, + newb = block2d_to_blocknd(block.values.T, block.items, shape, + [ major_labels, minor_labels ], ref_items=selfsorted.columns) new_blocks.append(newb) @@ -1878,7 +1874,7 @@ def set_value(self, index, col, value): new_index, new_columns = self._expand_axes((index, col)) result = self.reindex(index=new_index, columns=new_columns, copy=False) - likely_dtype = com._infer_dtype(value) + likely_dtype, value = _infer_dtype_from_scalar(value) made_bigger = not np.array_equal(new_columns, self.columns) @@ -2207,6 +2203,9 @@ def _sanitize_column(self, key, value): if key in self.columns: existing_piece = self[key] + # upcast the scalar + dtype, value = _infer_dtype_from_scalar(value) + # transpose hack if isinstance(existing_piece, DataFrame): shape = (len(existing_piece.columns), len(self.index)) @@ -2214,14 +2213,14 @@ def _sanitize_column(self, key, value): else: value = np.repeat(value, len(self.index)) - # special case for now - if (com.is_float_dtype(existing_piece) and - com.is_integer_dtype(value)): - value = value.astype(np.float64) + value = value.astype(dtype) else: - value = np.repeat(value, len(self.index)) + # upcast the scalar + dtype, value = _infer_dtype_from_scalar(value) + value = np.array(np.repeat(value, len(self.index)), dtype=dtype) + value = com._possibly_cast_to_datetime(value, dtype) return np.atleast_2d(np.asarray(value)) def pop(self, item): @@ -5460,11 +5459,17 @@ def _prep_ndarray(values, copy=True): if len(values) == 0: return np.empty((0, 0), dtype=object) - arr = np.asarray(values) - # NumPy strings are a pain, convert to object - if issubclass(arr.dtype.type, basestring): - arr = np.array(values, dtype=object, copy=True) - values = arr + def convert(v): + return com._possibly_convert_platform(v) + + # we could have a 1-dim or 2-dim list here + # this is equiv of np.asarray, but does object conversion + # and platform dtype preservation + if com.is_list_like(values[0]) or hasattr(values[0],'len'): + values = np.array([ 
convert(v) for v in values]) + else: + values = convert(values) + else: # drop subclass info, do not copy data values = np.asarray(values) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index ee024ce68b5b4..56802c2cb3bae 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -377,11 +377,11 @@ def shift(self, indexer, periods): new_values = self.values.take(indexer, axis=1) # convert integer to float if necessary. need to do a lot more than # that, handle boolean etc also - new_values = com._maybe_upcast(new_values) + new_values, fill_value = com._maybe_upcast(new_values) if periods > 0: - new_values[:, :periods] = np.nan + new_values[:, :periods] = fill_value else: - new_values[:, periods:] = np.nan + new_values[:, periods:] = fill_value return make_block(new_values, self.items, self.ref_items) def where(self, func, other, cond = None, raise_on_error = True, try_cast = False): @@ -1412,7 +1412,7 @@ def _make_na_block(self, items, ref_items, fill_value=np.nan): block_shape = list(self.shape) block_shape[0] = len(items) - dtype = com._infer_dtype(fill_value) + dtype, fill_value = com._infer_dtype_from_scalar(fill_value) block_values = np.empty(block_shape, dtype=dtype) block_values.fill(fill_value) na_block = make_block(block_values, items, ref_items) diff --git a/pandas/core/panel.py b/pandas/core/panel.py index 6b867f9a643db..6e52193a2c025 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -7,7 +7,7 @@ import sys import numpy as np from pandas.core.common import (PandasError, _mut_exclusive, - _try_sort, _default_index, _infer_dtype, + _try_sort, _default_index, _infer_dtype_from_scalar, notnull) from pandas.core.categorical import Factor from pandas.core.index import (Index, MultiIndex, _ensure_index, @@ -657,8 +657,8 @@ def set_value(self, *args): axes = self._expand_axes(args) d = self._construct_axes_dict_from(self, axes, copy=False) result = self.reindex(**d) - - likely_dtype = com._infer_dtype(args[-1]) + args 
= list(args) + likely_dtype, args[-1] = _infer_dtype_from_scalar(args[-1]) made_bigger = not np.array_equal( axes[0], getattr(self, self._info_axis)) # how to make this logic simpler? @@ -693,7 +693,7 @@ def __setitem__(self, key, value): assert(value.shape == shape[1:]) mat = np.asarray(value) elif np.isscalar(value): - dtype = _infer_dtype(value) + dtype, value = _infer_dtype_from_scalar(value) mat = np.empty(shape[1:], dtype=dtype) mat.fill(value) else: diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index 362215703e1f2..c86273b8a1cca 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -9,7 +9,8 @@ from pandas.core.frame import DataFrame from pandas.core.categorical import Categorical -from pandas.core.common import notnull, _ensure_platform_int +from pandas.core.common import (notnull, _ensure_platform_int, _maybe_promote, + _maybe_upcast) from pandas.core.groupby import (get_group_index, _compress_group_index, decons_group_index) import pandas.core.common as com @@ -148,12 +149,11 @@ def get_new_values(self): stride = values.shape[1] result_width = width * stride - new_values = np.empty((length, result_width), dtype=values.dtype) + dtype, fill_value = _maybe_promote(values.dtype) + new_values = np.empty((length, result_width), dtype=dtype) + new_values.fill(fill_value) new_mask = np.zeros((length, result_width), dtype=bool) - new_values = com._maybe_upcast(new_values) - new_values.fill(np.nan) - # is there a simpler / faster way of doing this? for i in xrange(values.shape[1]): chunk = new_values[:, i * width: (i + 1) * width] @@ -761,40 +761,6 @@ def make_axis_dummies(frame, axis='minor', transform=None): return DataFrame(values, columns=items, index=frame.index) -def block2d_to_block3d(values, items, shape, major_labels, minor_labels, - ref_items=None): - """ - Developer method for pivoting DataFrame -> Panel. 
Used in HDFStore and - DataFrame.to_panel - """ - from pandas.core.internals import make_block - panel_shape = (len(items),) + shape - - # TODO: lexsort depth needs to be 2!! - - # Create observation selection vector using major and minor - # labels, for converting to panel format. - selector = minor_labels + shape[1] * major_labels - mask = np.zeros(np.prod(shape), dtype=bool) - mask.put(selector, True) - - pvalues = np.empty(panel_shape, dtype=values.dtype) - if not issubclass(pvalues.dtype.type, (np.integer, np.bool_)): - pvalues.fill(np.nan) - elif not mask.all(): - pvalues = com._maybe_upcast(pvalues) - pvalues.fill(np.nan) - - values = values - for i in xrange(len(items)): - pvalues[i].flat[mask] = values[:, i] - - if ref_items is None: - ref_items = items - - return make_block(pvalues, items, ref_items) - - def block2d_to_blocknd(values, items, shape, labels, ref_items=None): """ pivot to the labels shape """ from pandas.core.internals import make_block @@ -808,12 +774,12 @@ def block2d_to_blocknd(values, items, shape, labels, ref_items=None): mask = np.zeros(np.prod(shape), dtype=bool) mask.put(selector, True) - pvalues = np.empty(panel_shape, dtype=values.dtype) - if not issubclass(pvalues.dtype.type, (np.integer, np.bool_)): - pvalues.fill(np.nan) - elif not mask.all(): - pvalues = com._maybe_upcast(pvalues) - pvalues.fill(np.nan) + if mask.all(): + pvalues = np.empty(panel_shape, dtype=values.dtype) + else: + dtype, fill_value = _maybe_promote(values.dtype) + pvalues = np.empty(panel_shape, dtype=dtype) + pvalues.fill(fill_value) values = values for i in xrange(len(items)): diff --git a/pandas/core/series.py b/pandas/core/series.py index bb154896651cd..21109593489ad 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -15,8 +15,9 @@ import numpy.ma as ma from pandas.core.common import (isnull, notnull, _is_bool_indexer, - _default_index, _maybe_upcast, - _asarray_tuplesafe, is_integer_dtype) + _default_index, _maybe_promote, + 
_asarray_tuplesafe, is_integer_dtype, + _infer_dtype_from_scalar) from pandas.core.index import (Index, MultiIndex, InvalidIndexError, _ensure_index, _handle_legacy_indexes) from pandas.core.indexing import _SeriesIndexer, _check_bool_indexer @@ -2817,15 +2818,15 @@ def _get_values(): return values if offset is None: - new_values = pa.empty(len(self), dtype=self.dtype) - new_values = _maybe_upcast(new_values) + dtype, fill_value = _maybe_promote(self.dtype) + new_values = pa.empty(len(self), dtype=dtype) if periods > 0: new_values[periods:] = self.values[:-periods] - new_values[:periods] = nan + new_values[:periods] = fill_value elif periods < 0: new_values[:periods] = self.values[-periods:] - new_values[periods:] = nan + new_values[periods:] = fill_value return Series(new_values, index=self.index, name=self.name) elif isinstance(self.index, PeriodIndex): @@ -3111,11 +3112,12 @@ def _try_cast(arr): raise subarr = pa.array(data, dtype=object, copy=copy) subarr = lib.maybe_convert_objects(subarr) - subarr = com._possibly_cast_to_datetime(subarr, dtype) + else: - subarr = lib.list_to_object_array(data) - subarr = lib.maybe_convert_objects(subarr) - subarr = com._possibly_cast_to_datetime(subarr, dtype) + subarr = com._possibly_convert_platform(data) + + subarr = com._possibly_cast_to_datetime(subarr, dtype) + else: subarr = _try_cast(data) @@ -3126,29 +3128,16 @@ def _try_cast(arr): elif index is not None: value = data - # If we create an empty array using a string to infer - # the dtype, NumPy will only allocate one character per entry - # so this is kind of bad. Alternately we could use np.repeat - # instead of np.empty (but then you still don't want things - # coming out as np.str_! 
- if isinstance(value, basestring) and dtype is None: - dtype = np.object_ - + # figure out the dtype from the value (upcast if necessary) if dtype is None: - - # a 1-element ndarray - if isinstance(value, pa.Array): - dtype = value.dtype - value = value.item() - else: - value, dtype = _dtype_from_scalar(value) - - subarr = pa.empty(len(index), dtype=dtype) + dtype, value = _infer_dtype_from_scalar(value) else: # need to possibly convert the value here value = com._possibly_cast_to_datetime(value, dtype) - subarr = pa.empty(len(index), dtype=dtype) + + subarr = pa.empty(len(index), dtype=dtype) subarr.fill(value) + else: return subarr.item() @@ -3176,14 +3165,6 @@ def _try_cast(arr): return subarr -def _dtype_from_scalar(val): - if isinstance(val, np.datetime64): - # ugly hacklet - val = lib.Timestamp(val).value - return val, np.dtype('M8[ns]') - return val, type(val) - - def _get_rename_function(mapper): if isinstance(mapper, (dict, Series)): def f(x): diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 84c2ef4957529..b56b6c5e5923f 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -23,7 +23,7 @@ from pandas.core.categorical import Categorical from pandas.core.common import _asarray_tuplesafe, _try_sort from pandas.core.internals import BlockManager, make_block, form_blocks -from pandas.core.reshape import block2d_to_block3d, block2d_to_blocknd, factor_indexer +from pandas.core.reshape import block2d_to_blocknd, factor_indexer from pandas.core.index import Int64Index import pandas.core.common as com from pandas.tools.merge import concat diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index c628bf3f0df97..fd4186ed39902 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -372,6 +372,11 @@ def test_setitem(self): self.assertEqual(smaller['col10'].dtype, np.object_) self.assert_((smaller['col10'] == ['1', '2']).all()) + # with a dtype + for dtype in ['int32','int64','float32','float64']: + 
self.frame[dtype] = np.array(arr,dtype=dtype) + self.assert_(self.frame[dtype].dtype.name == dtype) + def test_setitem_tuple(self): self.frame['A', 'B'] = self.frame['A'] assert_series_equal(self.frame['A', 'B'], self.frame['A']) @@ -436,8 +441,10 @@ def test_setitem_cast(self): self.assert_(self.frame['D'].dtype == np.int64) # #669, should not cast? + # this is now set to int64, which means a replacement of the column to + # the value dtype (and nothing to do with the existing dtype) self.frame['B'] = 0 - self.assert_(self.frame['B'].dtype == np.float_) + self.assert_(self.frame['B'].dtype == np.int64) # cast if pass array of course self.frame['B'] = np.arange(len(self.frame)) @@ -445,18 +452,18 @@ def test_setitem_cast(self): self.frame['foo'] = 'bar' self.frame['foo'] = 0 - self.assert_(self.frame['foo'].dtype == np.int_) + self.assert_(self.frame['foo'].dtype == np.int64) self.frame['foo'] = 'bar' self.frame['foo'] = 2.5 - self.assert_(self.frame['foo'].dtype == np.float_) + self.assert_(self.frame['foo'].dtype == np.float64) self.frame['something'] = 0 - self.assert_(self.frame['something'].dtype == np.int_) + self.assert_(self.frame['something'].dtype == np.int64) self.frame['something'] = 2 - self.assert_(self.frame['something'].dtype == np.int_) + self.assert_(self.frame['something'].dtype == np.int64) self.frame['something'] = 2.5 - self.assert_(self.frame['something'].dtype == np.float_) + self.assert_(self.frame['something'].dtype == np.float64) def test_setitem_boolean_column(self): expected = self.frame.copy() @@ -490,8 +497,12 @@ def test_setitem_corner(self): self.assertEqual(len(dm.columns), 2) self.assertEqual(dm.values.dtype, np.object_) + # upcast dm['C'] = 1 - self.assertEqual(dm['C'].dtype, np.int_) + self.assertEqual(dm['C'].dtype, np.int64) + + dm['E'] = 1. 
+ self.assertEqual(dm['E'].dtype, np.float64) # set existing column dm['A'] = 'bar' @@ -2369,9 +2380,9 @@ def test_constructor_scalar_inference(self): 'float': 3., 'complex': 4j, 'object': 'foo'} df = DataFrame(data, index=np.arange(10)) - self.assert_(df['int'].dtype == np.int_) + self.assert_(df['int'].dtype == np.int64) self.assert_(df['bool'].dtype == np.bool_) - self.assert_(df['float'].dtype == np.float_) + self.assert_(df['float'].dtype == np.float64) self.assert_(df['complex'].dtype == np.complex128) self.assert_(df['object'].dtype == np.object_) @@ -2689,9 +2700,16 @@ def test_constructor_column_duplicates(self): columns=['b', 'a', 'a']) def test_constructor_single_value(self): + + # expecting single value upcasting here df = DataFrame(0., index=[1, 2, 3], columns=['a', 'b', 'c']) - assert_frame_equal(df, DataFrame(np.zeros(df.shape), df.index, + assert_frame_equal(df, DataFrame(np.zeros(df.shape).astype('float64'), df.index, df.columns)) + + df = DataFrame(0, index=[1, 2, 3], columns=['a', 'b', 'c']) + assert_frame_equal(df, DataFrame(np.zeros(df.shape).astype('int64'), df.index, + df.columns)) + df = DataFrame('a', index=[1, 2], columns=['a', 'c']) assert_frame_equal(df, DataFrame(np.array([['a', 'a'], @@ -2705,6 +2723,136 @@ def test_constructor_single_value(self): self.assertRaises( com.PandasError, DataFrame, 'a', [1, 2], ['a', 'c'], float) + + def test_constructor_with_datetimes(self): + intname = np.dtype(np.int_).name + floatname = np.dtype(np.float_).name + datetime64name = np.dtype('M8[ns]').name + objectname = np.dtype(np.object_).name + + # single item + df = DataFrame({'A' : 1, 'B' : 'foo', 'C' : 'bar', 'D' : Timestamp("20010101"), 'E' : datetime(2001,1,2,0,0) }, + index=np.arange(10)) + result = df.get_dtype_counts() + expected = Series({'int64': 1, datetime64name: 2, objectname : 2}) + result.sort() + expected.sort() + assert_series_equal(result, expected) + + # check with ndarray construction ndim==0 (e.g. 
we are passing a ndim 0 ndarray with a dtype specified) + df = DataFrame({'a': 1., 'b': 2, 'c': 'foo', floatname : np.array(1.,dtype=floatname), + intname : np.array(1,dtype=intname)}, index=np.arange(10)) + result = df.get_dtype_counts() + expected = { objectname : 1 } + if intname == 'int64': + expected['int64'] = 2 + else: + expected['int64'] = 1 + expected[intname] = 1 + if floatname == 'float64': + expected['float64'] = 2 + else: + expected['float64'] = 1 + expected[floatname] = 1 + + result.sort() + expected = Series(expected) + expected.sort() + assert_series_equal(result, expected) + + # check with ndarray construction ndim>0 + df = DataFrame({'a': 1., 'b': 2, 'c': 'foo', floatname : np.array([1.]*10,dtype=floatname), + intname : np.array([1]*10,dtype=intname)}, index=np.arange(10)) + result = df.get_dtype_counts() + result.sort() + assert_series_equal(result, expected) + + # GH 2809 + from pandas import date_range + ind = date_range(start="2000-01-01", freq="D", periods=10) + datetimes = [ts.to_pydatetime() for ts in ind] + datetime_s = Series(datetimes) + self.assert_(datetime_s.dtype == 'M8[ns]') + df = DataFrame({'datetime_s':datetime_s}) + result = df.get_dtype_counts() + expected = Series({ datetime64name : 1 }) + result.sort() + expected.sort() + assert_series_equal(result, expected) + + # GH 2810 + ind = date_range(start="2000-01-01", freq="D", periods=10) + datetimes = [ts.to_pydatetime() for ts in ind] + dates = [ts.date() for ts in ind] + df = DataFrame({'datetimes': datetimes, 'dates':dates}) + result = df.get_dtype_counts() + expected = Series({ datetime64name : 1, objectname : 1 }) + result.sort() + expected.sort() + assert_series_equal(result, expected) + + def test_constructor_for_list_with_dtypes(self): + intname = np.dtype(np.int_).name + floatname = np.dtype(np.float_).name + datetime64name = np.dtype('M8[ns]').name + objectname = np.dtype(np.object_).name + + # test list of lists/ndarrays + df = DataFrame([np.arange(5) for x in 
range(5)]) + result = df.get_dtype_counts() + expected = Series({'int64' : 5}) + + df = DataFrame([np.array(np.arange(5),dtype='int32') for x in range(5)]) + result = df.get_dtype_counts() + expected = Series({'int32' : 5}) + + # overflow issue? (we always expect int64 upcasting here) + df = DataFrame({'a' : [2**31,2**31+1]}) + result = df.get_dtype_counts() + expected = Series({'int64' : 1 }) + assert_series_equal(result, expected) + + # GH #2751 (construction with no index specified), make sure we cast to int64/float64 regardless of platform + df = DataFrame([1, 2]) + result = df.get_dtype_counts() + expected = Series({'int64': 1 }) + assert_series_equal(result, expected) + + df = DataFrame([1.,2.]) + result = df.get_dtype_counts() + expected = Series({'float64' : 1 }) + assert_series_equal(result, expected) + + df = DataFrame({'a' : [1, 2]}) + result = df.get_dtype_counts() + expected = Series({'int64' : 1}) + assert_series_equal(result, expected) + + df = DataFrame({'a' : [1., 2.]}) + result = df.get_dtype_counts() + expected = Series({'float64' : 1}) + assert_series_equal(result, expected) + + df = DataFrame({'a' : 1 }, index=range(3)) + result = df.get_dtype_counts() + expected = Series({'int64': 1}) + assert_series_equal(result, expected) + + df = DataFrame({'a' : 1. 
}, index=range(3)) + result = df.get_dtype_counts() + expected = Series({'float64': 1 }) + assert_series_equal(result, expected) + + # with object list + df = DataFrame({'a':[1,2,4,7], 'b':[1.2, 2.3, 5.1, 6.3], + 'c':list('abcd'), 'd':[datetime(2000,1,1) for i in range(4)], + 'e' : [1.,2,4.,7]}) + result = df.get_dtype_counts() + expected = Series({'int64': 1, 'float64' : 2, datetime64name: 1, objectname : 1}) + result.sort() + expected.sort() + assert_series_equal(result, expected) + def test_new_empty_index(self): df1 = DataFrame(randn(0, 3)) df2 = DataFrame(randn(0, 3)) @@ -7156,7 +7304,7 @@ def test_get_numeric_data(self): df = DataFrame({'a': 1., 'b': 2, 'c': 'foo', 'f' : Timestamp('20010102')}, index=np.arange(10)) result = df.get_dtype_counts() - expected = Series({intname: 1, floatname : 1, datetime64name: 1, objectname : 1}) + expected = Series({'int64': 1, 'float64' : 1, datetime64name: 1, objectname : 1}) result.sort() expected.sort() assert_series_equal(result, expected) @@ -8099,74 +8247,6 @@ def test_as_matrix_lcd(self): values = self.mixed_int.as_matrix(['C']) self.assert_(values.dtype == np.uint8) - - def test_constructor_with_datetimes(self): - intname = np.dtype(np.int_).name - floatname = np.dtype(np.float_).name - datetime64name = np.dtype('M8[ns]').name - objectname = np.dtype(np.object_).name - - # single item - df = DataFrame({'A' : 1, 'B' : 'foo', 'C' : 'bar', 'D' : Timestamp("20010101"), 'E' : datetime(2001,1,2,0,0) }, - index=np.arange(10)) - result = df.get_dtype_counts() - expected = Series({intname: 1, datetime64name: 2, objectname : 2}) - result.sort() - expected.sort() - assert_series_equal(result, expected) - - # check with ndarray construction ndim==0 (e.g. 
we are passing a ndim 0 ndarray with a dtype specified) - df = DataFrame({'a': 1., 'b': 2, 'c': 'foo', floatname : np.array(1.,dtype=floatname), - intname : np.array(1,dtype=intname)}, index=np.arange(10)) - result = df.get_dtype_counts() - expected = Series({intname: 2, floatname : 2, objectname : 1}) - result.sort() - expected.sort() - assert_series_equal(result, expected) - - # check with ndarray construction ndim>0 - df = DataFrame({'a': 1., 'b': 2, 'c': 'foo', floatname : np.array([1.]*10,dtype=floatname), - intname : np.array([1]*10,dtype=intname)}, index=np.arange(10)) - result = df.get_dtype_counts() - expected = Series({intname: 2, floatname : 2, objectname : 1}) - result.sort() - expected.sort() - assert_series_equal(result, expected) - - # GH #2751 (construction with no index specified) - df = DataFrame({'a':[1,2,4,7], 'b':[1.2, 2.3, 5.1, 6.3], 'c':list('abcd'), 'd':[datetime(2000,1,1) for i in range(4)] }) - result = df.get_dtype_counts() - # TODO: fix this on 32-bit (or decide it's ok behavior?) 
- # expected = Series({intname: 1, floatname : 1, datetime64name: 1, objectname : 1}) - expected = Series({'int64': 1, floatname : 1, datetime64name: 1, objectname : 1}) - result.sort() - expected.sort() - assert_series_equal(result, expected) - - # GH 2809 - from pandas import date_range - ind = date_range(start="2000-01-01", freq="D", periods=10) - datetimes = [ts.to_pydatetime() for ts in ind] - datetime_s = Series(datetimes) - self.assert_(datetime_s.dtype == 'M8[ns]') - df = DataFrame({'datetime_s':datetime_s}) - result = df.get_dtype_counts() - expected = Series({ datetime64name : 1 }) - result.sort() - expected.sort() - assert_series_equal(result, expected) - - # GH 2810 - ind = date_range(start="2000-01-01", freq="D", periods=10) - datetimes = [ts.to_pydatetime() for ts in ind] - dates = [ts.date() for ts in ind] - df = DataFrame({'datetimes': datetimes, 'dates':dates}) - result = df.get_dtype_counts() - expected = Series({ datetime64name : 1, objectname : 1 }) - result.sort() - expected.sort() - assert_series_equal(result, expected) - def test_constructor_frame_copy(self): cop = DataFrame(self.frame, copy=True) cop['A'] = 5 diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index 07a02f18d8337..da7a0f68b3eb4 100644 --- a/pandas/tests/test_panel.py +++ b/pandas/tests/test_panel.py @@ -418,7 +418,7 @@ def test_setitem(self): # scalar self.panel['ItemG'] = 1 self.panel['ItemE'] = True - self.assert_(self.panel['ItemG'].values.dtype == np.int_) + self.assert_(self.panel['ItemG'].values.dtype == np.int64) self.assert_(self.panel['ItemE'].values.dtype == np.bool_) # object dtype diff --git a/pandas/tests/test_panel4d.py b/pandas/tests/test_panel4d.py index 87bfba7c55cce..5bb452deb1d4d 100644 --- a/pandas/tests/test_panel4d.py +++ b/pandas/tests/test_panel4d.py @@ -358,7 +358,7 @@ def test_setitem(self): # scalar self.panel4d['lG'] = 1 self.panel4d['lE'] = True - self.assert_(self.panel4d['lG'].values.dtype == np.int_) + 
self.assert_(self.panel4d['lG'].values.dtype == np.int64) self.assert_(self.panel4d['lE'].values.dtype == np.bool_) # object dtype diff --git a/pandas/tools/tests/test_merge.py b/pandas/tools/tests/test_merge.py index 8820d43975885..d1c4710c16aad 100644 --- a/pandas/tools/tests/test_merge.py +++ b/pandas/tools/tests/test_merge.py @@ -287,7 +287,7 @@ def test_join_index_mixed(self): df1 = DataFrame({'A': 1., 'B': 2, 'C': 'foo', 'D': True}, index=np.arange(10), columns=['A', 'B', 'C', 'D']) - self.assert_(df1['B'].dtype == np.int) + self.assert_(df1['B'].dtype == np.int64) self.assert_(df1['D'].dtype == np.bool_) df2 = DataFrame({'A': 1., 'B': 2, 'C': 'foo', 'D': True},