diff --git a/doc/source/release.rst b/doc/source/release.rst index 08bfcbe42ad5b..f5997e2c35e72 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -107,6 +107,10 @@ API Changes or numbering columns as needed (:issue:`2385`) - Slicing and advanced/boolean indexing operations on ``Index`` classes will no longer change type of the resulting index (:issue:`6440`). +- ``set_index`` no longer converts MultiIndexes to an Index of tuples (:issue:`6459`). +- Slicing with negative start, stop & step values handles corner cases better (:issue:`6531`): + - ``df.iloc[:-len(df)]`` is now empty + - ``df.iloc[len(df)::-1]`` now enumerates all elements in reverse Experimental Features ~~~~~~~~~~~~~~~~~~~~~ @@ -139,6 +143,7 @@ Improvements to existing features Bug Fixes ~~~~~~~~~ +- Bug in Series ValueError when index doesn't match data (:issue:`6532`) - Bug in ``pd.DataFrame.sort_index`` where mergesort wasn't stable when ``ascending=False`` (:issue:`6399`) - Bug in ``pd.tseries.frequencies.to_offset`` when argument has leading zeroes (:issue:`6391`) - Bug in version string gen. for dev versions with shallow clones / install from tarball (:issue:`6127`) @@ -180,7 +185,7 @@ Bug Fixes - Bug in :meth:`DataFrame.replace` where nested dicts were erroneously depending on the order of dictionary keys and values (:issue:`5338`). - Perf issue in concatting with empty objects (:issue:`3259`) -- Clarify sorting of ``sym_diff`` on ``Index``es with ``NaN``s (:isssue:`6444`) +- Clarify sorting of ``sym_diff`` on ``Index``es with ``NaN``s (:issue:`6444`) - Regression in ``MultiIndex.from_product`` with a ``DatetimeIndex`` as input (:issue:`6439`) - Bug in ``str.extract`` when passed a non-default index (:issue:`6348`) - Bug in ``str.split`` when passed ``pat=None`` and ``n=1`` (:issue:`6466`) @@ -194,6 +199,16 @@ Bug Fixes - Bug in ``read_html`` tests where redirected invalid URLs would make one test fail (:issue:`6445`). 
- Bug in multi-axis indexing using ``.loc`` on non-unique indices (:issue:`6504`) +- Bug that caused _ref_locs corruption when slice indexing across columns axis of a DataFrame (:issue:`6525`) +- Regression from 0.13 in the treatment of numpy ``datetime64`` non-ns dtypes in Series creation (:issue:`6529`) +- ``.names`` attribute of MultiIndexes passed to ``set_index`` is now preserved (:issue:`6459`). +- Bug in setitem with a duplicate index and an alignable rhs (:issue:`6541`) +- Bug in setitem with loc on mixed integer Indexes (:issue:`6546`) +- Bug in ``pd.read_stata`` which would use the wrong data types and missing values (:issue:`6327`) +- Bug in ``DataFrame.to_stata`` that led to data loss in certain cases, and could export using the + wrong data types and missing values (:issue:`6335`) +- Inconsistent types in Timestamp addition/subtraction (:issue:`6543`) + pandas 0.13.1 ------------- diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt index 4432e9e891e7d..310047545d84e 100644 --- a/doc/source/v0.14.0.txt +++ b/doc/source/v0.14.0.txt @@ -92,6 +92,49 @@ These are out-of-bounds selections .. ipython:: python i[[0,1,2]].astype(np.int_) +- ``set_index`` no longer converts MultiIndexes to an Index of tuples. For example, + the old behavior returned an Index in this case (:issue:`6459`): + + .. ipython:: python + :suppress: + + from itertools import product + tuples = list(product(('a', 'b'), ('c', 'd'))) + mi = MultiIndex.from_tuples(tuples) + df_multi = DataFrame(np.random.randn(4, 2), index=mi) + tuple_ind = pd.Index(tuples) + + .. ipython:: python + + df_multi.index + + @suppress + df_multi.index = tuple_ind + + # Old behavior, cast MultiIndex to an Index + df_multi.set_index(df_multi.index) + + @suppress + df_multi.index = mi + + # New behavior + df_multi.set_index(df_multi.index) + + This also applies when passing multiple indices to ``set_index``: + + .. 
ipython:: python + + @suppress + df_multi.index = tuple_ind + + # Old output, 2-level MultiIndex of tuples + df_multi.set_index([df_multi.index, df_multi.index]) + + @suppress + df_multi.index = mi + + # New output, 4-level MultiIndex + df_multi.set_index([df_multi.index, df_multi.index]) MultiIndexing Using Slicers @@ -248,6 +291,9 @@ Enhancements using ``DataFrame.to_csv`` (:issue:`5414`, :issue:`4528`) - Added a ``to_julian_date`` function to ``TimeStamp`` and ``DatetimeIndex`` to convert to the Julian Date used primarily in astronomy. (:issue:`4041`) +- ``DataFrame.to_stata`` will now check data for compatibility with Stata data types + and will upcast when needed. When it isn't possible to losslessly upcast, a warning + is raised (:issue:`6327`) Performance ~~~~~~~~~~~ diff --git a/pandas/core/common.py b/pandas/core/common.py index 69addea1c4188..eb3c159ae916d 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -124,7 +124,7 @@ def isnull(obj): See also -------- - pandas.notnull: boolean inverse of pandas.isnull + pandas.notnull: boolean inverse of pandas.isnull """ return _isnull(obj) @@ -272,7 +272,7 @@ def notnull(obj): isnulled : array-like of bool or bool Array or bool indicating whether an object is *not* null or if an array is given which of the element is *not* null. 
- + See also -------- pandas.isnull : boolean inverse of pandas.notnull @@ -1727,10 +1727,7 @@ def _possibly_cast_to_datetime(value, dtype, coerce=False): dtype = value.dtype if dtype.kind == 'M' and dtype != _NS_DTYPE: - try: - value = tslib.array_to_datetime(value) - except: - raise + value = value.astype(_NS_DTYPE) elif dtype.kind == 'm' and dtype != _TD_DTYPE: from pandas.tseries.timedeltas import \ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 6c1037f018e02..4c02c8abab353 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1867,11 +1867,6 @@ def eval(self, expr, **kwargs): kwargs['resolvers'] = kwargs.get('resolvers', ()) + resolvers return _eval(expr, **kwargs) - def _slice(self, slobj, axis=0, raise_on_error=False, typ=None): - axis = self._get_block_manager_axis(axis) - new_data = self._data.get_slice( - slobj, axis=axis, raise_on_error=raise_on_error) - return self._constructor(new_data) def _box_item_values(self, key, values): items = self.columns[self.columns.get_loc(key)] @@ -2240,7 +2235,15 @@ def set_index(self, keys, drop=True, append=False, inplace=False, to_remove = [] for col in keys: - if isinstance(col, Series): + if isinstance(col, MultiIndex): + # append all but the last column so we don't have to modify + # the end of this loop + for n in range(col.nlevels - 1): + arrays.append(col.get_level_values(n)) + + level = col.get_level_values(col.nlevels - 1) + names.extend(col.names) + elif isinstance(col, (Series, Index)): level = col.values names.append(col.name) elif isinstance(col, (list, np.ndarray)): diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 8ca397eda17e9..120e03e9962d8 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1079,6 +1079,16 @@ def _clear_item_cache(self, i=None): else: self._item_cache.clear() + def _slice(self, slobj, axis=0, typ=None): + """ + Construct a slice of this container. + + typ parameter is maintained for compatibility with Series slicing. 
+ + """ + axis = self._get_block_manager_axis(axis) + return self._constructor(self._data.get_slice(slobj, axis=axis)) + def _set_item(self, key, value): self._data.set(key, value) self._clear_item_cache() diff --git a/pandas/core/index.py b/pandas/core/index.py index c16e2eff06904..30e18d239d950 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -555,6 +555,29 @@ def _convert_list_indexer(self, key, typ=None): """ convert a list indexer. these should be locations """ return key + def _convert_list_indexer_for_mixed(self, keyarr, typ=None): + """ passed a key that is tuplesafe that is integer based + and we have a mixed index (e.g. number/labels). figure out + the indexer. return None if we can't help + """ + if com.is_integer_dtype(keyarr) and not self.is_floating(): + if self.inferred_type != 'integer': + keyarr = np.where(keyarr < 0, + len(self) + keyarr, keyarr) + + if self.inferred_type == 'mixed-integer': + indexer = self.get_indexer(keyarr) + if (indexer >= 0).all(): + return indexer + + from pandas.core.indexing import _maybe_convert_indices + return _maybe_convert_indices(indexer, len(self)) + + elif not self.inferred_type == 'integer': + return keyarr + + return None + def _convert_indexer_error(self, key, msg=None): if msg is None: msg = 'label' @@ -987,8 +1010,13 @@ def intersection(self, other): except TypeError: pass - indexer = self.get_indexer(other.values) - indexer = indexer.take((indexer != -1).nonzero()[0]) + try: + indexer = self.get_indexer(other.values) + indexer = indexer.take((indexer != -1).nonzero()[0]) + except: + # duplicates + indexer = self.get_indexer_non_unique(other.values)[0].unique() + return self.take(indexer) def diff(self, other): diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 6691db5f35bb4..e3cbddebb6643 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -91,32 +91,8 @@ def _get_label(self, label, axis=0): def _get_loc(self, key, axis=0): return self.obj._ixs(key, 
axis=axis) - def _slice(self, obj, axis=0, raise_on_error=False, typ=None): - - # make out-of-bounds into bounds of the object - if typ == 'iloc': - ax = self.obj._get_axis(axis) - l = len(ax) - start = obj.start - stop = obj.stop - step = obj.step - if start is not None: - # degenerate to return nothing - if start >= l: - return self._getitem_axis(tuple(),axis=axis) - - # equiv to a null slice - elif start <= -l: - start = None - if stop is not None: - if stop > l: - stop = None - elif stop <= -l: - stop = None - obj = slice(start,stop,step) - - return self.obj._slice(obj, axis=axis, raise_on_error=raise_on_error, - typ=typ) + def _slice(self, obj, axis=0, typ=None): + return self.obj._slice(obj, axis=axis, typ=typ) def __setitem__(self, key, value): @@ -441,7 +417,9 @@ def can_do_equal_len(): # align to if item in value: v = value[item] - v = v.reindex(self.obj[item].index & v.index) + i = self.obj[item].index + v = v.reindex(i & v.index) + setter(item, v.values) else: setter(item, np.nan) @@ -909,20 +887,10 @@ def _reindex(keys, level=None): # asarray can be unsafe, NumPy strings are weird keyarr = _asarray_tuplesafe(key) - if is_integer_dtype(keyarr) and not labels.is_floating(): - if labels.inferred_type != 'integer': - keyarr = np.where(keyarr < 0, - len(labels) + keyarr, keyarr) - - if labels.inferred_type == 'mixed-integer': - indexer = labels.get_indexer(keyarr) - if (indexer >= 0).all(): - self.obj.take(indexer, axis=axis, convert=True) - else: - return self.obj.take(keyarr, axis=axis) - elif not labels.inferred_type == 'integer': - - return self.obj.take(keyarr, axis=axis) + # handle a mixed integer scenario + indexer = labels._convert_list_indexer_for_mixed(keyarr, typ=self.name) + if indexer is not None: + return self.obj.take(indexer, axis=axis) # this is not the most robust, but... 
if (isinstance(labels, MultiIndex) and @@ -1062,11 +1030,9 @@ def _convert_to_indexer(self, obj, axis=0, is_setter=False): objarr = _asarray_tuplesafe(obj) # If have integer labels, defer to label-based indexing - if is_integer_dtype(objarr) and not is_int_index: - if labels.inferred_type != 'integer': - objarr = np.where(objarr < 0, - len(labels) + objarr, objarr) - return objarr + indexer = labels._convert_list_indexer_for_mixed(objarr, typ=self.name) + if indexer is not None: + return indexer # this is not the most robust, but... if (isinstance(labels, MultiIndex) and @@ -1353,8 +1319,7 @@ def _get_slice_axis(self, slice_obj, axis=0): return obj if isinstance(slice_obj, slice): - return self._slice(slice_obj, axis=axis, raise_on_error=True, - typ='iloc') + return self._slice(slice_obj, axis=axis, typ='iloc') else: return self.obj.take(slice_obj, axis=axis, convert=False) @@ -1657,18 +1622,6 @@ def _need_slice(obj): (obj.step is not None and obj.step != 1)) -def _check_slice_bounds(slobj, values): - l = len(values) - start = slobj.start - if start is not None: - if start < -l or start > l - 1: - raise IndexError("out-of-bounds on slice (start)") - stop = slobj.stop - if stop is not None: - if stop < -l - 1 or stop > l: - raise IndexError("out-of-bounds on slice (end)") - - def _maybe_droplevels(index, key): # drop levels original_index = index diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 74a8ce0118d88..39eb03eebdb8c 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -14,8 +14,7 @@ _values_from_object, _is_null_datelike_scalar) from pandas.core.index import (Index, MultiIndex, _ensure_index, _handle_legacy_indexes) -from pandas.core.indexing import (_check_slice_bounds, _maybe_convert_indices, - _length_of_indexer) +from pandas.core.indexing import (_maybe_convert_indices, _length_of_indexer) import pandas.core.common as com from pandas.sparse.array import _maybe_to_sparse, SparseArray import pandas.lib as lib @@ 
-61,8 +60,8 @@ def __init__(self, values, items, ref_items, ndim=None, fastpath=False, raise ValueError('Wrong number of dimensions') if len(items) != len(values): - raise ValueError('Wrong number of items passed %d, indices imply ' - '%d' % (len(items), len(values))) + raise ValueError('Wrong number of items passed %d, index implies ' + '%d' % (len(values), len(items))) self.set_ref_locs(placement) self.values = values @@ -133,6 +132,12 @@ def take_ref_locs(self, indexer): tindexer[indexer] = False tindexer = tindexer.astype(int).cumsum()[indexer] ref_locs = ref_locs[indexer] + + # Make sure the result is a copy, or otherwise self._ref_locs will be + # updated. + if ref_locs.base is not None: + ref_locs = ref_locs.copy() + ref_locs -= tindexer return ref_locs @@ -2663,12 +2668,9 @@ def combine(self, blocks): new_axes[0] = new_items return self.__class__(new_blocks, new_axes, do_integrity_check=False) - def get_slice(self, slobj, axis=0, raise_on_error=False): + def get_slice(self, slobj, axis=0): new_axes = list(self.axes) - if raise_on_error: - _check_slice_bounds(slobj, new_axes[axis]) - new_axes[axis] = new_axes[axis][slobj] if axis == 0: @@ -3733,9 +3735,7 @@ def _delete_from_block(self, i, item): ) self._values = self._block.values - def get_slice(self, slobj, raise_on_error=False): - if raise_on_error: - _check_slice_bounds(slobj, self.index) + def get_slice(self, slobj): return self.__class__(self._block._slice(slobj), self.index[slobj], fastpath=True) diff --git a/pandas/core/panel.py b/pandas/core/panel.py index eba526f574375..2bf50bb1bf142 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -539,12 +539,6 @@ def _box_item_values(self, key, values): d = self._construct_axes_dict_for_slice(self._AXIS_ORDERS[1:]) return self._constructor_sliced(values, **d) - def _slice(self, slobj, axis=0, raise_on_error=False, typ=None): - new_data = self._data.get_slice(slobj, - axis=axis, - raise_on_error=raise_on_error) - return self._constructor(new_data) 
- def __setitem__(self, key, value): shape = tuple(self.shape) if isinstance(value, self._constructor_sliced): diff --git a/pandas/core/series.py b/pandas/core/series.py index 9e6c0bd9305ab..4fc7ced6e8900 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -28,7 +28,7 @@ from pandas.core.index import (Index, MultiIndex, InvalidIndexError, _ensure_index, _handle_legacy_indexes) from pandas.core.indexing import ( - _check_bool_indexer, _check_slice_bounds, + _check_bool_indexer, _is_index_slice, _maybe_convert_indices) from pandas.core import generic, base from pandas.core.internals import SingleBlockManager @@ -469,9 +469,7 @@ def _ixs(self, i, axis=0): def _is_mixed_type(self): return False - def _slice(self, slobj, axis=0, raise_on_error=False, typ=None): - if raise_on_error: - _check_slice_bounds(slobj, self.values) + def _slice(self, slobj, axis=0, typ=None): slobj = self.index._convert_slice_indexer(slobj, typ=typ or 'getitem') return self._get_values(slobj) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 55bcbd76c2248..2ecdb22a5cc7b 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -23,7 +23,7 @@ from pandas.compat import long, lrange, lmap, lzip from pandas import isnull from pandas.io.common import get_filepath_or_buffer - +from pandas.tslib import NaT def read_stata(filepath_or_buffer, convert_dates=True, convert_categoricals=True, encoding=None, index=None): @@ -48,7 +48,7 @@ def read_stata(filepath_or_buffer, convert_dates=True, return reader.data(convert_dates, convert_categoricals, index) -_date_formats = ["%tc", "%tC", "%td", "%tw", "%tm", "%tq", "%th", "%ty"] +_date_formats = ["%tc", "%tC", "%td", "%d", "%tw", "%tm", "%tq", "%th", "%ty"] def _stata_elapsed_date_to_datetime(date, fmt): @@ -97,6 +97,7 @@ def _stata_elapsed_date_to_datetime(date, fmt): # numpy types and numpy datetime isn't mature enough / we can't rely on # pandas version > 0.7.1 #TODO: IIRC relative delta doesn't play well with np.datetime? 
+ #TODO: When pandas supports more than datetime64[ns], this should be improved to use correct range, e.g. datetime[Y] for yearly if np.isnan(date): return np.datetime64('nat') @@ -109,7 +110,7 @@ def _stata_elapsed_date_to_datetime(date, fmt): from warnings import warn warn("Encountered %tC format. Leaving in Stata Internal Format.") return date - elif fmt in ["%td", "td"]: + elif fmt in ["%td", "td", "%d", "d"]: return stata_epoch + datetime.timedelta(int(date)) elif fmt in ["%tw", "tw"]: # does not count leap days - 7 days is a week year = datetime.datetime(stata_epoch.year + date // 52, 1, 1) @@ -150,6 +151,11 @@ def _datetime_to_stata_elapsed(date, fmt): if not isinstance(date, datetime.datetime): raise ValueError("date should be datetime.datetime format") stata_epoch = datetime.datetime(1960, 1, 1) + # Handle NaTs + if date is NaT: + # Missing value for dates ('.'), assumed always double + # TODO: Should be moved so a const somewhere, and consolidated + return struct.unpack(' 100 or data[col].min() < -127: + data[col] = data[col].astype(np.int16) + elif dtype == np.int16: + if data[col].max() > 32740 or data[col].min() < -32767: + data[col] = data[col].astype(np.int32) + elif dtype == np.int64: + if data[col].max() <= 2147483620 and data[col].min() >= -2147483647: + data[col] = data[col].astype(np.int32) + else: + data[col] = data[col].astype(np.float64) + if data[col].max() <= 2 * 53 or data[col].min() >= -2 ** 53: + ws = precision_loss_doc % ('int64', 'float64') + + if ws: + import warnings + + warnings.warn(ws, PossiblePrecisionLoss) + + return data + + class StataMissingValue(StringMixin): """ An observation's missing value. @@ -193,14 +255,23 @@ class StataMissingValue(StringMixin): ----- More information: """ - + # TODO: Needs test def __init__(self, offset, value): self._value = value - if type(value) is int or type(value) is long: - self._str = value - offset is 1 and \ - '.' or ('.' 
+ chr(value - offset + 96)) + value_type = type(value) + if value_type in int: + loc = value - offset + elif value_type in (float, np.float32, np.float64): + if value <= np.finfo(np.float32).max: # float32 + conv_str, byte_loc, scale = ' nmax: if self._missing_values: return StataMissingValue(nmax, d) @@ -855,11 +942,12 @@ def _dtype_to_stata_type(dtype): See TYPE_MAP and comments for an explanation. This is also explained in the dta spec. 1 - 244 are strings of this length - 251 - chr(251) - for int8 and int16, byte - 252 - chr(252) - for int32, int - 253 - chr(253) - for int64, long - 254 - chr(254) - for float32, float - 255 - chr(255) - double, double + Pandas Stata + 251 - chr(251) - for int8 byte + 252 - chr(252) - for int16 int + 253 - chr(253) - for int32 long + 254 - chr(254) - for float32 float + 255 - chr(255) - for double double If there are dates to convert, then dtype will already have the correct type inserted. @@ -878,8 +966,10 @@ def _dtype_to_stata_type(dtype): elif dtype == np.int64: return chr(253) elif dtype == np.int32: + return chr(253) + elif dtype == np.int16: return chr(252) - elif dtype == np.int8 or dtype == np.int16: + elif dtype == np.int8: return chr(251) else: # pragma : no cover raise ValueError("Data type %s not currently understood. 
" @@ -970,7 +1060,7 @@ def __init__(self, fname, data, convert_dates=None, write_index=True, self._file = _open_file_binary_write( fname, self._encoding or self._default_encoding ) - self.type_converters = {253: np.long, 252: int} + self.type_converters = {253: np.int32, 252: np.int16, 251: np.int8} def _write(self, to_write): """ @@ -990,11 +1080,14 @@ def __init__(self, data): self.data = data def __iter__(self): - for i, row in data.iterrows(): - yield row + for row in data.itertuples(): + # First element is index, so remove + yield row[1:] if self._write_index: data = data.reset_index() + # Check columns for compatbaility with stata + data = _cast_to_stata_types(data) self.datarows = DataFrameRowIter(data) self.nobs, self.nvar = data.shape self.data = data @@ -1181,7 +1274,7 @@ def _write_data_dates(self): self._write(var) else: if isnull(var): # this only matters for floats - var = MISSING_VALUES[typ] + var = MISSING_VALUES[TYPE_MAP[typ]] self._file.write(struct.pack(byteorder+TYPE_MAP[typ], var)) def _null_terminate(self, s, as_string=False): diff --git a/pandas/io/tests/data/stata1.dta b/pandas/io/tests/data/stata1_114.dta similarity index 100% rename from pandas/io/tests/data/stata1.dta rename to pandas/io/tests/data/stata1_114.dta diff --git a/pandas/io/tests/data/stata1_v13.dta b/pandas/io/tests/data/stata1_117.dta similarity index 100% rename from pandas/io/tests/data/stata1_v13.dta rename to pandas/io/tests/data/stata1_117.dta diff --git a/pandas/io/tests/data/stata2_113.dta b/pandas/io/tests/data/stata2_113.dta new file mode 100644 index 0000000000000..09c90dca943d1 Binary files /dev/null and b/pandas/io/tests/data/stata2_113.dta differ diff --git a/pandas/io/tests/data/stata2.dta b/pandas/io/tests/data/stata2_114.dta similarity index 100% rename from pandas/io/tests/data/stata2.dta rename to pandas/io/tests/data/stata2_114.dta diff --git a/pandas/io/tests/data/stata2_115.dta b/pandas/io/tests/data/stata2_115.dta new file mode 100644 index 
0000000000000..ad7dda3fdc4b3 Binary files /dev/null and b/pandas/io/tests/data/stata2_115.dta differ diff --git a/pandas/io/tests/data/stata2_115.dta~1dc157c... Added additional data files for testing alternative Stata file formats b/pandas/io/tests/data/stata2_115.dta~1dc157c... Added additional data files for testing alternative Stata file formats new file mode 100644 index 0000000000000..ad7dda3fdc4b3 Binary files /dev/null and b/pandas/io/tests/data/stata2_115.dta~1dc157c... Added additional data files for testing alternative Stata file formats differ diff --git a/pandas/io/tests/data/stata2_v13.dta b/pandas/io/tests/data/stata2_117.dta similarity index 100% rename from pandas/io/tests/data/stata2_v13.dta rename to pandas/io/tests/data/stata2_117.dta diff --git a/pandas/io/tests/data/stata3_113.dta b/pandas/io/tests/data/stata3_113.dta new file mode 100644 index 0000000000000..f78150a2e4326 Binary files /dev/null and b/pandas/io/tests/data/stata3_113.dta differ diff --git a/pandas/io/tests/data/stata3.dta b/pandas/io/tests/data/stata3_114.dta similarity index 100% rename from pandas/io/tests/data/stata3.dta rename to pandas/io/tests/data/stata3_114.dta diff --git a/pandas/io/tests/data/stata3_115.dta b/pandas/io/tests/data/stata3_115.dta new file mode 100644 index 0000000000000..1c4ad0dae8092 Binary files /dev/null and b/pandas/io/tests/data/stata3_115.dta differ diff --git a/pandas/io/tests/data/stata3_115.dta~1dc157c... Added additional data files for testing alternative Stata file formats b/pandas/io/tests/data/stata3_115.dta~1dc157c... Added additional data files for testing alternative Stata file formats new file mode 100644 index 0000000000000..1c4ad0dae8092 Binary files /dev/null and b/pandas/io/tests/data/stata3_115.dta~1dc157c... 
Added additional data files for testing alternative Stata file formats differ diff --git a/pandas/io/tests/data/stata3_v13.dta b/pandas/io/tests/data/stata3_117.dta similarity index 100% rename from pandas/io/tests/data/stata3_v13.dta rename to pandas/io/tests/data/stata3_117.dta diff --git a/pandas/io/tests/data/stata4_113.dta b/pandas/io/tests/data/stata4_113.dta new file mode 100644 index 0000000000000..9d7d5abb1b921 Binary files /dev/null and b/pandas/io/tests/data/stata4_113.dta differ diff --git a/pandas/io/tests/data/stata4.dta b/pandas/io/tests/data/stata4_114.dta similarity index 100% rename from pandas/io/tests/data/stata4.dta rename to pandas/io/tests/data/stata4_114.dta diff --git a/pandas/io/tests/data/stata4_115.dta b/pandas/io/tests/data/stata4_115.dta new file mode 100644 index 0000000000000..2c68cfb393b9e Binary files /dev/null and b/pandas/io/tests/data/stata4_115.dta differ diff --git a/pandas/io/tests/data/stata4_115.dta~1dc157c... Added additional data files for testing alternative Stata file formats b/pandas/io/tests/data/stata4_115.dta~1dc157c... Added additional data files for testing alternative Stata file formats new file mode 100644 index 0000000000000..2c68cfb393b9e Binary files /dev/null and b/pandas/io/tests/data/stata4_115.dta~1dc157c... 
Added additional data files for testing alternative Stata file formats differ diff --git a/pandas/io/tests/data/stata4_v13.dta b/pandas/io/tests/data/stata4_117.dta similarity index 100% rename from pandas/io/tests/data/stata4_v13.dta rename to pandas/io/tests/data/stata4_117.dta diff --git a/pandas/io/tests/data/stata5.csv b/pandas/io/tests/data/stata5.csv new file mode 100644 index 0000000000000..8eb0c2854a740 --- /dev/null +++ b/pandas/io/tests/data/stata5.csv @@ -0,0 +1,19 @@ +byte_,int_,long_,float_,double_,date_td,string_,string_1 +0,0,0,0,0,,"a","a" +1,1,1,1,1,,"ab","b" +-1,-1,-1,-1,-1,,"abc","c" +100,32740,-2147483647,-1.70100000027769e+38,-2.0000000000000e+307,1970-01-01,"abcdefghijklmnop","d" +-127,-32767,2147483620,1.70100000027769e+38,8.0000000000000e+307,1970-01-02,"abcdefghijklmnopqrstuvwxyz","e" +,0,,,,2014-01-01,"ABCDEFGHIJKLMNOPQRSTUVWXYZ","f" +0,,,,,2114-01-01,"1234567890","1" +,,0,,,2014-12-31,"This string has 244 characters, so that ir is the maximum length permitted by Stata. This string has 244 characters, so that ir is the maximum length permitted by Stata. This string has 244 characters, so that ir is the maximum length permitted","2" +.a,.a,.a,.a,.a,2012-02-29,"!","A" +100,32740,-2.15e+09,-1.70e+38,-2.0e+307,01jan1970,"abcdefghijklmnop","d" +-127,-32767,2.15e+09,1.70e+38,8.0e+307,02jan1970,"abcdefghijklmnopqrstuvwxyz","e" +,0,,,,01jan2014,"ABCDEFGHIJKLMNOPQRSTUVWXYZ","f" +0,,,,,01jan2114,"1234567890","1" +,,0,,,31dec2014,"This string has 244 characters, so that ir is the maximum length permitted by Stata. This string has 244 characters, so that ir is the maximum length permitted by Stata. This string has 244 characters, so that ir is the maximum length permitted","2" +.a,.a,.a,.a,.a,29feb2012,"!","A" +.z,.z,.z,.z,.z,,"&","Z" +,,,0,,,"1.23","!" +,,,,0,,"10jan1970","." 
diff --git a/pandas/io/tests/data/stata5_113.dta b/pandas/io/tests/data/stata5_113.dta new file mode 100644 index 0000000000000..3615928d55838 Binary files /dev/null and b/pandas/io/tests/data/stata5_113.dta differ diff --git a/pandas/io/tests/data/stata5_114.dta b/pandas/io/tests/data/stata5_114.dta new file mode 100644 index 0000000000000..bebc6a72c5e34 Binary files /dev/null and b/pandas/io/tests/data/stata5_114.dta differ diff --git a/pandas/io/tests/data/stata5_115.dta b/pandas/io/tests/data/stata5_115.dta new file mode 100644 index 0000000000000..c54bd62c24dd2 Binary files /dev/null and b/pandas/io/tests/data/stata5_115.dta differ diff --git a/pandas/io/tests/data/stata6.csv b/pandas/io/tests/data/stata6.csv new file mode 100644 index 0000000000000..27a1dc64f530b --- /dev/null +++ b/pandas/io/tests/data/stata6.csv @@ -0,0 +1,6 @@ +byte_,int_,long_,float_,double_,date_td,string_,string_1 +0,0,0,0,0,1960-01-01,"a","a" +1,1,1,1,1,3014-12-31,"ab","b" +-1,-1,-1,-1,-1,2014-12-31,"abc","c" +100,32740,-2147483647,-1.7010000002777e+38,-2.000000000000e+307,1970-01-01,"This string has 244 characters, so that ir is the maximum length permitted by Stata. This string has 244 characters, so that ir is the maximum length permitted by Stata. 
This string has 244 characters, so that ir is the maximum length permitted","d" +-127,-32767,2147483620,1.7010000002777e+38,8.000000000000e+307,1970-01-02,"abcdefghijklmnopqrstuvwxyz","e" diff --git a/pandas/io/tests/data/stata6_113.dta b/pandas/io/tests/data/stata6_113.dta new file mode 100644 index 0000000000000..2e4795b167f26 Binary files /dev/null and b/pandas/io/tests/data/stata6_113.dta differ diff --git a/pandas/io/tests/data/stata6_114.dta b/pandas/io/tests/data/stata6_114.dta new file mode 100644 index 0000000000000..aa507e474dd43 Binary files /dev/null and b/pandas/io/tests/data/stata6_114.dta differ diff --git a/pandas/io/tests/data/stata6_115.dta b/pandas/io/tests/data/stata6_115.dta new file mode 100644 index 0000000000000..c513463868113 Binary files /dev/null and b/pandas/io/tests/data/stata6_115.dta differ diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index 1640bee7a9929..ac4b9662fc57e 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -27,22 +27,46 @@ def setUp(self): # Unit test datasets for dta7 - dta9 (old stata formats 104, 105 and 107) can be downloaded from: # http://stata-press.com/data/glmext.html self.dirpath = tm.get_data_path() - self.dta1 = os.path.join(self.dirpath, 'stata1.dta') - self.dta2 = os.path.join(self.dirpath, 'stata2.dta') - self.dta3 = os.path.join(self.dirpath, 'stata3.dta') + self.dta1_114 = os.path.join(self.dirpath, 'stata1_114.dta') + self.dta1_117 = os.path.join(self.dirpath, 'stata1_117.dta') + + self.dta2_113 = os.path.join(self.dirpath, 'stata2_113.dta') + self.dta2_114 = os.path.join(self.dirpath, 'stata2_114.dta') + self.dta2_115 = os.path.join(self.dirpath, 'stata2_115.dta') + self.dta2_117 = os.path.join(self.dirpath, 'stata2_117.dta') + + self.dta3_113 = os.path.join(self.dirpath, 'stata3_113.dta') + self.dta3_114 = os.path.join(self.dirpath, 'stata3_114.dta') + self.dta3_115 = os.path.join(self.dirpath, 'stata3_115.dta') + self.dta3_117 = 
os.path.join(self.dirpath, 'stata3_117.dta') self.csv3 = os.path.join(self.dirpath, 'stata3.csv') - self.dta4 = os.path.join(self.dirpath, 'stata4.dta') + + self.dta4_113 = os.path.join(self.dirpath, 'stata4_113.dta') + self.dta4_114 = os.path.join(self.dirpath, 'stata4_114.dta') + self.dta4_115 = os.path.join(self.dirpath, 'stata4_115.dta') + self.dta4_117 = os.path.join(self.dirpath, 'stata4_117.dta') + self.dta7 = os.path.join(self.dirpath, 'cancer.dta') self.csv7 = os.path.join(self.dirpath, 'cancer.csv') + self.dta8 = os.path.join(self.dirpath, 'tbl19-3.dta') + self.csv8 = os.path.join(self.dirpath, 'tbl19-3.csv') + self.dta9 = os.path.join(self.dirpath, 'lbw.dta') self.csv9 = os.path.join(self.dirpath, 'lbw.csv') + self.dta_encoding = os.path.join(self.dirpath, 'stata1_encoding.dta') - self.dta1_13 = os.path.join(self.dirpath, 'stata1_v13.dta') - self.dta2_13 = os.path.join(self.dirpath, 'stata2_v13.dta') - self.dta3_13 = os.path.join(self.dirpath, 'stata3_v13.dta') - self.dta4_13 = os.path.join(self.dirpath, 'stata4_v13.dta') + + self.csv14 = os.path.join(self.dirpath, 'stata5.csv') + self.dta14_113 = os.path.join(self.dirpath, 'stata5_113.dta') + self.dta14_114 = os.path.join(self.dirpath, 'stata5_114.dta') + self.dta14_115 = os.path.join(self.dirpath, 'stata5_115.dta') + + self.csv15 = os.path.join(self.dirpath, 'stata6.csv') + self.dta15_113 = os.path.join(self.dirpath, 'stata6_113.dta') + self.dta15_114 = os.path.join(self.dirpath, 'stata6_114.dta') + self.dta15_115 = os.path.join(self.dirpath, 'stata6_115.dta') def read_dta(self, file): return read_stata(file, convert_dates=True) @@ -51,10 +75,10 @@ def read_csv(self, file): return read_csv(file, parse_dates=True) def test_read_dta1(self): - reader = StataReader(self.dta1) - parsed = reader.data() - reader_13 = StataReader(self.dta1_13) - parsed_13 = reader_13.data() + reader_114 = StataReader(self.dta1_114) + parsed_114 = reader_114.data() + reader_117 = StataReader(self.dta1_117) + parsed_117 = 
reader_117.data() # Pandas uses np.nan as missing value. # Thus, all columns will be of type float, regardless of their name. expected = DataFrame([(np.nan, np.nan, np.nan, np.nan, np.nan)], @@ -65,8 +89,8 @@ def test_read_dta1(self): # the casting doesn't fail so need to match stata here expected['float_miss'] = expected['float_miss'].astype(np.float32) - tm.assert_frame_equal(parsed, expected) - tm.assert_frame_equal(parsed_13, expected) + tm.assert_frame_equal(parsed_114, expected) + tm.assert_frame_equal(parsed_117, expected) def test_read_dta2(self): if LooseVersion(sys.version) < '2.7': @@ -109,34 +133,48 @@ def test_read_dta2(self): 'monthly_date', 'quarterly_date', 'half_yearly_date', 'yearly_date'] ) + expected['yearly_date'] = expected['yearly_date'].astype('O') with warnings.catch_warnings(record=True) as w: - parsed = self.read_dta(self.dta2) - parsed_13 = self.read_dta(self.dta2_13) + parsed_114 = self.read_dta(self.dta2_114) + parsed_115 = self.read_dta(self.dta2_115) + parsed_117 = self.read_dta(self.dta2_117) + # 113 is buggy due to limited date format support in Stata + # parsed_113 = self.read_dta(self.dta2_113) + np.testing.assert_equal( len(w), 1) # should get a warning for that format.
# buggy test because of the NaT comparison on certain platforms - # - #tm.assert_frame_equal(parsed, expected) - #tm.assert_frame_equal(parsed_13, expected) + # Format 113 test fails since it does not support tc and tC formats + # tm.assert_frame_equal(parsed_113, expected) + tm.assert_frame_equal(parsed_114, expected) + tm.assert_frame_equal(parsed_115, expected) + tm.assert_frame_equal(parsed_117, expected) def test_read_dta3(self): - parsed = self.read_dta(self.dta3) - parsed_13 = self.read_dta(self.dta3_13) + parsed_113 = self.read_dta(self.dta3_113) + parsed_114 = self.read_dta(self.dta3_114) + parsed_115 = self.read_dta(self.dta3_115) + parsed_117 = self.read_dta(self.dta3_117) # match stata here expected = self.read_csv(self.csv3) expected = expected.astype(np.float32) - expected['year'] = expected['year'].astype(np.int32) - expected['quarter'] = expected['quarter'].astype(np.int16) + expected['year'] = expected['year'].astype(np.int16) + expected['quarter'] = expected['quarter'].astype(np.int8) - tm.assert_frame_equal(parsed, expected) - tm.assert_frame_equal(parsed_13, expected) + tm.assert_frame_equal(parsed_113, expected) + tm.assert_frame_equal(parsed_114, expected) + tm.assert_frame_equal(parsed_115, expected) + tm.assert_frame_equal(parsed_117, expected) def test_read_dta4(self): - parsed = self.read_dta(self.dta4) - parsed_13 = self.read_dta(self.dta4_13) + parsed_113 = self.read_dta(self.dta4_113) + parsed_114 = self.read_dta(self.dta4_114) + parsed_115 = self.read_dta(self.dta4_115) + parsed_117 = self.read_dta(self.dta4_117) + expected = DataFrame.from_records( [ ["one", "ten", "one", "one", "one"], @@ -153,11 +191,13 @@ def test_read_dta4(self): columns=['fully_labeled', 'fully_labeled2', 'incompletely_labeled', 'labeled_with_missings', 'float_labelled']) - tm.assert_frame_equal(parsed, expected) - tm.assert_frame_equal(parsed_13, expected) + tm.assert_frame_equal(parsed_113, expected) + tm.assert_frame_equal(parsed_114, expected) + 
tm.assert_frame_equal(parsed_115, expected) + tm.assert_frame_equal(parsed_117, expected) def test_read_write_dta5(self): - skip_if_not_little_endian() + # skip_if_not_little_endian() original = DataFrame([(np.nan, np.nan, np.nan, np.nan, np.nan)], columns=['float_miss', 'double_miss', 'byte_miss', @@ -171,10 +211,13 @@ def test_read_write_dta5(self): original) def test_write_dta6(self): - skip_if_not_little_endian() + # skip_if_not_little_endian() original = self.read_csv(self.csv3) original.index.name = 'index' + original.index = original.index.astype(np.int32) + original['year'] = original['year'].astype(np.int32) + original['quarter'] = original['quarter'].astype(np.int32) with tm.ensure_clean() as path: original.to_stata(path, None, False) @@ -201,7 +244,7 @@ def test_read_dta9(self): tm.assert_frame_equal(parsed, expected) def test_read_write_dta10(self): - skip_if_not_little_endian() + # skip_if_not_little_endian() original = DataFrame(data=[["string", "object", 1, 1.1, np.datetime64('2003-12-25')]], @@ -209,6 +252,8 @@ def test_read_write_dta10(self): 'datetime']) original["object"] = Series(original["object"], dtype=object) original.index.name = 'index' + original.index = original.index.astype(np.int32) + original['integer'] = original['integer'].astype(np.int32) with tm.ensure_clean() as path: original.to_stata(path, {'datetime': 'tc'}, False) @@ -238,13 +283,14 @@ def test_encoding(self): self.assert_(isinstance(result, unicode)) def test_read_write_dta11(self): - skip_if_not_little_endian() + # skip_if_not_little_endian() original = DataFrame([(1, 2, 3, 4)], columns=['good', compat.u('b\u00E4d'), '8number', 'astringwithmorethan32characters______']) formatted = DataFrame([(1, 2, 3, 4)], columns=['good', 'b_d', '_8number', 'astringwithmorethan32characters_']) formatted.index.name = 'index' + formatted = formatted.astype(np.int32) with tm.ensure_clean() as path: with warnings.catch_warnings(record=True) as w: @@ -256,13 +302,14 @@ def 
test_read_write_dta11(self): tm.assert_frame_equal(written_and_read_again.set_index('index'), formatted) def test_read_write_dta12(self): - skip_if_not_little_endian() + # skip_if_not_little_endian() original = DataFrame([(1, 2, 3, 4)], columns=['astringwithmorethan32characters_1', 'astringwithmorethan32characters_2', '+', '-']) formatted = DataFrame([(1, 2, 3, 4)], columns=['astringwithmorethan32characters_', '_0astringwithmorethan32character', '_', '_1_']) formatted.index.name = 'index' + formatted = formatted.astype(np.int32) with tm.ensure_clean() as path: with warnings.catch_warnings(record=True) as w: @@ -272,6 +319,64 @@ def test_read_write_dta12(self): written_and_read_again = self.read_dta(path) tm.assert_frame_equal(written_and_read_again.set_index('index'), formatted) + + def test_read_write_dta13(self): + s1 = Series(2**9,dtype=np.int16) + s2 = Series(2**17,dtype=np.int32) + s3 = Series(2**33,dtype=np.int64) + original = DataFrame({'int16':s1,'int32':s2,'int64':s3}) + original.index.name = 'index' + + formatted = original.copy() # copy, so `original` still holds int64 when it is written + formatted['int64'] = formatted['int64'].astype(np.float64) + + with tm.ensure_clean() as path: + original.to_stata(path) + written_and_read_again = self.read_dta(path) + tm.assert_frame_equal(written_and_read_again.set_index('index'), + formatted) + + def test_read_write_reread_dta14(self): + expected = self.read_csv(self.csv14) + cols = ['byte_', 'int_', 'long_', 'float_', 'double_'] + for col in cols: + expected[col] = expected[col].convert_objects(convert_numeric=True) + expected['float_'] = expected['float_'].astype(np.float32) + expected['date_td'] = pd.to_datetime(expected['date_td'], coerce=True) + + parsed_113 = self.read_dta(self.dta14_113) + parsed_113.index.name = 'index' + parsed_114 = self.read_dta(self.dta14_114) + parsed_114.index.name = 'index' + parsed_115 = self.read_dta(self.dta14_115) + parsed_115.index.name = 'index' + + tm.assert_frame_equal(parsed_114, parsed_113) + tm.assert_frame_equal(parsed_114, parsed_115)
+ + with tm.ensure_clean() as path: + parsed_114.to_stata(path, {'date_td': 'td'}) + written_and_read_again = self.read_dta(path) + tm.assert_frame_equal(written_and_read_again.set_index('index'), parsed_114) + + def test_read_write_reread_dta15(self): + expected = self.read_csv(self.csv15) + expected['byte_'] = expected['byte_'].astype(np.int8) + expected['int_'] = expected['int_'].astype(np.int16) + expected['long_'] = expected['long_'].astype(np.int32) + expected['float_'] = expected['float_'].astype(np.float32) + expected['double_'] = expected['double_'].astype(np.float64) + expected['date_td'] = expected['date_td'].apply(datetime.strptime, args=('%Y-%m-%d',)) + + parsed_113 = self.read_dta(self.dta15_113) + parsed_114 = self.read_dta(self.dta15_114) + parsed_115 = self.read_dta(self.dta15_115) + + tm.assert_frame_equal(expected, parsed_114) + tm.assert_frame_equal(parsed_113, parsed_114) + tm.assert_frame_equal(parsed_114, parsed_115) + + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], diff --git a/pandas/sparse/frame.py b/pandas/sparse/frame.py index 6e76155619c09..a69c07494af8a 100644 --- a/pandas/sparse/frame.py +++ b/pandas/sparse/frame.py @@ -13,7 +13,7 @@ from pandas.core.common import (isnull, notnull, _pickle_array, _unpickle_array, _try_sort) from pandas.core.index import Index, MultiIndex, _ensure_index -from pandas.core.indexing import _check_slice_bounds, _maybe_convert_indices +from pandas.core.indexing import _maybe_convert_indices from pandas.core.series import Series from pandas.core.frame import (DataFrame, extract_index, _prep_ndarray, _default_index) @@ -379,15 +379,11 @@ def set_value(self, index, col, value, takeable=False): return dense.to_sparse(kind=self._default_kind, fill_value=self._default_fill_value) - def _slice(self, slobj, axis=0, raise_on_error=False, typ=None): + def _slice(self, slobj, axis=0, typ=None): if axis == 0: - if raise_on_error: -
_check_slice_bounds(slobj, self.index) new_index = self.index[slobj] new_columns = self.columns else: - if raise_on_error: - _check_slice_bounds(slobj, self.columns) new_index = self.index new_columns = self.columns[slobj] diff --git a/pandas/sparse/panel.py b/pandas/sparse/panel.py index 86dcf97c8bd3d..20bbc58cc908f 100644 --- a/pandas/sparse/panel.py +++ b/pandas/sparse/panel.py @@ -187,7 +187,7 @@ def _ixs(self, i, axis=0): return self.xs(key, axis=axis) - def _slice(self, slobj, axis=0, raise_on_error=False, typ=None): + def _slice(self, slobj, axis=0, typ=None): """ for compat as we don't support Block Manager here """ diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 3c39d610c1b88..1cc357ce2a260 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -12267,6 +12267,46 @@ def test_empty_frame_dtypes_ftypes(self): ('b', 'bool:dense'), ('c', 'float64:dense')]))) + def test_dtypes_are_correct_after_column_slice(self): + # GH6525 + df = pd.DataFrame(index=range(5), columns=list("abc"), dtype=np.float_) + odict = OrderedDict + assert_series_equal(df.dtypes, + pd.Series(odict([('a', np.float_), ('b', np.float_), + ('c', np.float_),]))) + assert_series_equal(df.iloc[:,2:].dtypes, + pd.Series(odict([('c', np.float_)]))) + assert_series_equal(df.dtypes, + pd.Series(odict([('a', np.float_), ('b', np.float_), + ('c', np.float_),]))) + + def test_set_index_names(self): + df = pd.util.testing.makeDataFrame() + df.index.name = 'name' + + self.assertEquals(df.set_index(df.index).index.names, ['name']) + + mi = MultiIndex.from_arrays(df[['A', 'B']].T.values, names=['A', 'B']) + mi2 = MultiIndex.from_arrays(df[['A', 'B', 'A', 'B']].T.values, + names=['A', 'B', 'A', 'B']) + + df = df.set_index(['A', 'B']) + + self.assertEquals(df.set_index(df.index).index.names, ['A', 'B']) + + # Check that set_index isn't converting a MultiIndex into an Index + self.assertTrue(isinstance(df.set_index(df.index).index, MultiIndex)) + + # Check actual 
equality + tm.assert_index_equal(df.set_index(df.index).index, mi) + + # Check that [MultiIndex, MultiIndex] yields a MultiIndex rather + # than a pair of tuples + self.assertTrue(isinstance(df.set_index([df.index, df.index]).index, MultiIndex)) + + # Check equality + tm.assert_index_equal(df.set_index([df.index, df.index]).index, mi2) + def skip_if_no_ne(engine='numexpr'): if engine == 'numexpr': diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py index d373f2f43ad3e..325d770fb62c9 100644 --- a/pandas/tests/test_indexing.py +++ b/pandas/tests/test_indexing.py @@ -393,12 +393,36 @@ def test_iloc_exceeds_bounds(self): self.assertRaises(IndexError, lambda : df.iloc[-30]) # slices are ok - result = df.iloc[:,4:10] + result = df.iloc[:,4:10] # 0 < start < len < stop expected = df.iloc[:,4:] assert_frame_equal(result,expected) - result = df.iloc[:,-4:-10] - expected = df.iloc[:,-4:] + result = df.iloc[:,-4:-10] # stop < 0 < start < len + expected = df.iloc[:,:0] + assert_frame_equal(result,expected) + + result = df.iloc[:,10:4:-1] # 0 < stop < len < start (down) + expected = df.iloc[:,:4:-1] + assert_frame_equal(result,expected) + + result = df.iloc[:,4:-10:-1] # stop < 0 < start < len (down) + expected = df.iloc[:,4::-1] + assert_frame_equal(result,expected) + + result = df.iloc[:,-10:4] # start < 0 < stop < len + expected = df.iloc[:,:4] + assert_frame_equal(result,expected) + + result = df.iloc[:,10:4] # 0 < stop < len < start + expected = df.iloc[:,:0] + assert_frame_equal(result,expected) + + result = df.iloc[:,-10:-11:-1] # stop < start < 0 < len (down) + expected = df.iloc[:,:0] + assert_frame_equal(result,expected) + + result = df.iloc[:,10:11] # 0 < len < start < stop + expected = df.iloc[:,:0] assert_frame_equal(result,expected) # slice bounds exceeding is ok @@ -406,6 +430,14 @@ def test_iloc_exceeds_bounds(self): expected = s.iloc[18:] assert_series_equal(result,expected) + result = s.iloc[30:] + expected = s.iloc[:0] + 
assert_series_equal(result,expected) + + result = s.iloc[30::-1] + expected = s.iloc[::-1] + assert_series_equal(result,expected) + # doc example def check(result,expected): str(result) @@ -564,6 +596,29 @@ def test_loc_setitem(self): expected = DataFrame({'a' : [0.5,-0.5,-1.5], 'b' : [0,1,2] }) assert_frame_equal(df,expected) + def test_loc_setitem_dups(self): + + # GH 6541 + df_orig = DataFrame({'me' : list('rttti'), + 'foo': list('aaade'), + 'bar': np.arange(5,dtype='float64')*1.34+2, + 'bar2': np.arange(5,dtype='float64')*-.34+2}).set_index('me') + + indexer = tuple(['r',['bar','bar2']]) + df = df_orig.copy() + df.loc[indexer]*=2.0 + assert_series_equal(df.loc[indexer],2.0*df_orig.loc[indexer]) + + indexer = tuple(['r','bar']) + df = df_orig.copy() + df.loc[indexer]*=2.0 + self.assertEqual(df.loc[indexer],2.0*df_orig.loc[indexer]) + + indexer = tuple(['t',['bar','bar2']]) + df = df_orig.copy() + df.loc[indexer]*=2.0 + assert_frame_equal(df.loc[indexer],2.0*df_orig.loc[indexer]) + def test_chained_getitem_with_lists(self): # GH6394 @@ -812,6 +867,19 @@ def test_loc_setitem_frame(self): expected = DataFrame(dict(A = Series(val1,index=keys1), B = Series(val2,index=keys2))).reindex(index=index) assert_frame_equal(df, expected) + # GH 6546 + # setting with mixed labels + df = DataFrame({1:[1,2],2:[3,4],'a':['a','b']}) + + result = df.loc[0,[1,2]] + expected = Series([1,3],index=[1,2],dtype=object) + assert_series_equal(result,expected) + + expected = DataFrame({1:[5,2],2:[6,4],'a':['a','b']}) + df.loc[0,[1,2]] = [5,6] + assert_frame_equal(df, expected) + + def test_loc_setitem_frame_multiples(self): # multiple setting diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index bae4036a68b37..faf5341276ae5 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -600,6 +600,25 @@ def test_constructor_dtype_datetime64(self): self.assertEqual(result['a'], Timestamp('20130101')) self.assertEqual(result['b'], 1) + # GH6529 + # coerce 
datetime64 non-ns properly + dates = date_range('01-Jan-2015', '01-Dec-2015', freq='M') + values2 = dates.view(np.ndarray).astype('datetime64[ns]') + expected = Series(values2, dates) + + # numpy < 1.7 is very odd about astyping + if not _np_version_under1p7: + for dtype in ['s','D','ms','us','ns']: + values1 = dates.view(np.ndarray).astype('M8[{0}]'.format(dtype)) + result = Series(values1, dates) + assert_series_equal(result,expected) + + # leave datetime.date alone + dates2 = np.array([ d.date() for d in dates.to_pydatetime() ],dtype=object) + series1 = Series(dates2, dates) + self.assert_numpy_array_equal(series1.values,dates2) + self.assertEqual(series1.dtype,object) + def test_constructor_dict(self): d = {'a': 0., 'b': 1., 'c': 2.} result = Series(d, index=['b', 'c', 'd', 'a']) diff --git a/pandas/tseries/tests/test_tslib.py b/pandas/tseries/tests/test_tslib.py index bc5b8dcfbd49a..a24f545901ccd 100644 --- a/pandas/tseries/tests/test_tslib.py +++ b/pandas/tseries/tests/test_tslib.py @@ -7,6 +7,7 @@ from pandas.core.api import Timestamp from pandas.tslib import period_asfreq, period_ordinal +from pandas.tseries.index import date_range from pandas.tseries.frequencies import get_freq from pandas import _np_version_under1p7 import pandas.util.testing as tm @@ -302,10 +303,32 @@ def test_period_ordinal_business_day(self): # Tuesday self.assertEqual(11418, period_ordinal(2013, 10, 8, 0, 0, 0, 0, 0, get_freq('B'))) -class TestTomeStampOps(tm.TestCase): +class TestTimestampOps(tm.TestCase): def test_timestamp_and_datetime(self): - self.assertEqual((Timestamp(datetime.datetime(2013, 10,13)) - datetime.datetime(2013, 10,12)).days, 1) - self.assertEqual((datetime.datetime(2013, 10, 12) - Timestamp(datetime.datetime(2013, 10,13))).days, -1) + self.assertEqual((Timestamp(datetime.datetime(2013, 10, 13)) - datetime.datetime(2013, 10, 12)).days, 1) + self.assertEqual((datetime.datetime(2013, 10, 12) - Timestamp(datetime.datetime(2013, 10, 13))).days, -1) + + def 
test_addition_subtraction_types(self): + # Assert on the types resulting from Timestamp +/- various date/time objects + datetime_instance = datetime.datetime(2014, 3, 4) + timedelta_instance = datetime.timedelta(seconds=1) + # build a timestamp with a frequency, since then it supports addition/subtraction of integers + timestamp_instance = date_range(datetime_instance, periods=1, freq='D')[0] + + self.assertEqual(type(timestamp_instance + 1), Timestamp) + self.assertEqual(type(timestamp_instance - 1), Timestamp) + + # Timestamp + datetime not supported, though subtraction is supported and yields timedelta + self.assertEqual(type(timestamp_instance - datetime_instance), datetime.timedelta) + + self.assertEqual(type(timestamp_instance + timedelta_instance), Timestamp) + self.assertEqual(type(timestamp_instance - timedelta_instance), Timestamp) + + if not _np_version_under1p7: + # Timestamp +/- datetime64 not supported, so not tested (could possibly assert error raised?) + timedelta64_instance = np.timedelta64(1, 'D') + self.assertEqual(type(timestamp_instance + timedelta64_instance), Timestamp) + self.assertEqual(type(timestamp_instance - timedelta64_instance), Timestamp) if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index f065ea90473c6..9ff73e7c92fdb 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -700,11 +700,11 @@ cdef class _Timestamp(datetime): return result def __sub__(self, other): - if is_integer_object(other): - neg_other = -other - return self + neg_other - # This calling convention is required - return datetime.__sub__(self, other) + if isinstance(other, datetime): + return datetime.__sub__(self, other) + + neg_other = -other + return self + neg_other cpdef _get_field(self, field): out = get_date_field(np.array([self.value], dtype=np.int64), field)