From 4173dbf62e08a806030fa1109625d4326de6ef19 Mon Sep 17 00:00:00 2001 From: Mike Kelly Date: Wed, 22 Oct 2014 15:27:59 -0400 Subject: [PATCH 1/2] Preserve dtype in merge keys when possible --- pandas/tools/merge.py | 69 ++++++++++++++++++++++++++++++-- pandas/tools/tests/test_merge.py | 26 +++++++++++- 2 files changed, 89 insertions(+), 6 deletions(-) diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py index 3371f63db1e1c..6df8954586adf 100644 --- a/pandas/tools/merge.py +++ b/pandas/tools/merge.py @@ -280,19 +280,28 @@ def _indicator_post_merge(self, result): return result def _maybe_add_join_keys(self, result, left_indexer, right_indexer): - # insert group keys + + consolidate = False + + left_has_missing = None + right_has_missing = None keys = zip(self.join_names, self.left_on, self.right_on) for i, (name, lname, rname) in enumerate(keys): if not _should_fill(lname, rname): continue + take_left, take_right = None, None + if name in result: +<<<<<<< HEAD key_indexer = result.columns.get_loc(name) +======= +>>>>>>> e79b978... Preserve dtype in merge keys when possible if left_indexer is not None and right_indexer is not None: - if name in self.left: +<<<<<<< HEAD if len(self.left) == 0: continue @@ -316,12 +325,60 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer): result.iloc[na_indexer, key_indexer] = ( algos.take_1d(self.left_join_keys[i], left_na_indexer)) +======= + + if left_has_missing is None: + left_has_missing = any(left_indexer == -1) + + if left_has_missing: + take_right = self.right_join_keys[i] + + if result[name].dtype != self.left[name].dtype: + take_left = self.left[name].values + + elif name in self.right: + + if right_has_missing is None: + right_has_missing = any(right_indexer == -1) + + if right_has_missing: + take_left = self.left_join_keys[i] + + if result[name].dtype != self.right[name].dtype: + take_right = self.right[name].values + +>>>>>>> e79b978... Preserve dtype in merge keys when possible elif left_indexer is not None \ and isinstance(self.left_join_keys[i], np.ndarray): - if name is None: - name = 'key_%d' % i + take_left = self.left_join_keys[i] + take_right = self.right_join_keys[i] + + if take_left is not None or take_right is not None: + + if take_left is None: + lvals = result[name].values + else: + lfill = take_left.dtype.type() + lvals = com.take_1d(take_left, left_indexer, fill_value=lfill) + + if take_right is None: + rvals = result[name].values + else: + rfill = take_right.dtype.type() + rvals = com.take_1d(take_right, right_indexer, fill_value=rfill) + + key_col = np.where(left_indexer != -1, lvals, rvals) + + if name in result: + if result[name].dtype != key_col.dtype: + consolidate = True + result[name] = key_col + else: + result.insert(i, name or 'key_%d' % i, key_col) + consolidate = True +<<<<<<< HEAD # a faster way? key_col = algos.take_1d(self.left_join_keys[i], left_indexer) na_indexer = (left_indexer == -1).nonzero()[0] @@ -329,6 +386,10 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer): key_col.put(na_indexer, algos.take_1d(self.right_join_keys[i], right_na_indexer)) result.insert(i, name, key_col) +======= + if consolidate: + result.consolidate(inplace=True) +>>>>>>> e79b978... Preserve dtype in merge keys when possible def _get_join_info(self): left_ax = self.left._data.axes[self.axis] diff --git a/pandas/tools/tests/test_merge.py b/pandas/tools/tests/test_merge.py index 01c651d496ecd..dc04b00dc211d 100644 --- a/pandas/tools/tests/test_merge.py +++ b/pandas/tools/tests/test_merge.py @@ -507,8 +507,8 @@ def test_join_many_non_unique_index(self): result = result.reset_index() - result['a'] = result['a'].astype(np.float64) - result['b'] = result['b'].astype(np.float64) + # result['a'] = result['a'].astype(np.float64) + # result['b'] = result['b'].astype(np.float64) assert_frame_equal(result, expected.ix[:, result.columns]) @@ -1033,6 +1033,7 @@ def test_overlapping_columns_error_message(self): df2.columns = ['key1', 'foo', 'foo'] self.assertRaises(ValueError, merge, df, df2) +<<<<<<< HEAD def test_merge_on_datetime64tz(self): # GH11405 @@ -1426,6 +1427,27 @@ def test_indicator(self): test5 = df3.merge(df4, on=['col1', 'col2'], how='outer', indicator=True) assert_frame_equal(test5, hand_coded_result) +======= + def test_merge_join_key_dtype_cast(self): + # #8596 + + df1 = DataFrame({'key': [1], 'v1': [10]}) + df2 = DataFrame({'key': [2], 'v1': [20]}) + df = merge(df1, df2, how='outer') + self.assertEqual(df['key'].dtype, 'int64') + + df1 = DataFrame({'key': [True], 'v1': [1]}) + df2 = DataFrame({'key': [False],'v1': [0]}) + df = merge(df1, df2, how='outer') + self.assertEqual(df['key'].dtype, 'bool') + + df1 = DataFrame({'val': [1]}) + df2 = DataFrame({'val': [2]}) + lkey = np.array([1]) + rkey = np.array([2]) + df = merge(df1, df2, left_on=lkey, right_on=rkey, how='outer') + self.assertEqual(df['key_0'].dtype, 'int64') +>>>>>>> e79b978... Preserve dtype in merge keys when possible def _check_merge(x, y): From 0a267cf3a506fa5badf527c49aceb44c4ea0b316 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 10 May 2016 19:59:44 -0400 Subject: [PATCH 2/2] BUG: preserve merge keys dtypes when possible closes #8596 xref to #13169 as assignment of Index of bools not retaining dtype --- doc/source/api.rst | 1 + doc/source/whatsnew/v0.18.2.txt | 58 ++++++++++- pandas/indexes/base.py | 18 ++++ pandas/indexes/category.py | 23 +++++ pandas/indexes/multi.py | 4 + pandas/tests/indexes/common.py | 14 ++- pandas/tests/indexes/test_category.py | 15 ++- pandas/tests/indexes/test_datetimelike.py | 67 +++++++++++- pandas/tests/indexes/test_multi.py | 8 ++ pandas/tests/types/test_types.py | 40 ++++++++ pandas/tools/merge.py | 84 +++++---------- pandas/tools/tests/test_merge.py | 118 ++++++++++++---------- pandas/tseries/base.py | 75 ++++++++++++-- pandas/types/api.py | 35 ++++++- 14 files changed, 433 insertions(+), 127 deletions(-) create mode 100644 pandas/tests/types/test_types.py diff --git a/doc/source/api.rst b/doc/source/api.rst index 9557867c252ed..9e7ae2357c541 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -1333,6 +1333,7 @@ Modifying and Computations Index.max Index.reindex Index.repeat + Index.where Index.take Index.putmask Index.set_names diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index 0e4d9780cb2d4..dfb5ebc9379b1 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -77,11 +77,20 @@ Other enhancements - The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``decimal`` option (:issue:`12933`) - ``Index.astype()`` now accepts an optional boolean argument ``copy``, which allows optional copying if the requirements on dtype are satisfied (:issue:`13209`) +- ``Index`` now supports the ``.where()`` function for same shape indexing (:issue:`13170`) + + .. ipython:: python + + idx = pd.Index(['a', 'b', 'c']) + idx.where([True, False, True]) + - ``Categorical.astype()`` now accepts an optional boolean argument ``copy``, effective when dtype is categorical (:issue:`13209`) - Consistent with the Python API, ``pd.read_csv()`` will now interpret ``+inf`` as positive infinity (:issue:`13274`) - ``pd.read_html()`` has gained support for the ``decimal`` option (:issue:`12907`) + + .. _whatsnew_0182.api: API changes @@ -119,7 +128,6 @@ New Behavior: type(s.tolist()[0]) - .. _whatsnew_0182.api.promote: ``Series`` type promotion on assignment @@ -171,6 +179,54 @@ This will now convert integers/floats with the default unit of ``ns``. pd.to_datetime([1, 'foo'], errors='coerce') +.. _whatsnew_0182.api.merging: + +Merging changes +^^^^^^^^^^^^^^^ + +Merging will now preserve the dtype of the join keys (:issue:`8596`) + +.. ipython:: python + + df1 = pd.DataFrame({'key': [1], 'v1': [10]}) + df1 + df2 = pd.DataFrame({'key': [1, 2], 'v1': [20, 30]}) + df2 + +Previous Behavior: + +.. code-block:: ipython + + In [5]: pd.merge(df1, df2, how='outer') + Out[5]: + key v1 + 0 1.0 10.0 + 1 1.0 20.0 + 2 2.0 30.0 + + In [6]: pd.merge(df1, df2, how='outer').dtypes + Out[6]: + key float64 + v1 float64 + dtype: object + +New Behavior: + +We are able to preserve the join keys + +.. ipython:: python + + pd.merge(df1, df2, how='outer') + pd.merge(df1, df2, how='outer').dtypes + +Of course if you have missing values that are introduced, then the +resulting dtype will be upcast (unchanged from previous). + +.. ipython:: python + + pd.merge(df1, df2, how='outer', on='key') + pd.merge(df1, df2, how='outer', on='key').dtypes + .. _whatsnew_0182.api.other: Other API changes diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index c029a4a74d9d0..82f16becbd511 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -465,6 +465,24 @@ def repeat(self, n, *args, **kwargs): nv.validate_repeat(args, kwargs) return self._shallow_copy(self._values.repeat(n)) + def where(self, cond, other=None): + """ + .. versionadded:: 0.18.2 + + Return an Index of same shape as self and whose corresponding + entries are from self where cond is True and otherwise are from + other. + + Parameters + ---------- + cond : boolean same length as self + other : scalar, or array-like + """ + if other is None: + other = self._na_value + values = np.where(cond, self.values, other) + return self._shallow_copy_with_infer(values, dtype=self.dtype) + def ravel(self, order='C'): """ return an ndarray of the flattened values of the underlying data diff --git a/pandas/indexes/category.py b/pandas/indexes/category.py index 8f343c5de5fb6..e877e43bcc603 100644 --- a/pandas/indexes/category.py +++ b/pandas/indexes/category.py @@ -307,6 +307,29 @@ def _can_reindex(self, indexer): """ always allow reindexing """ pass + def where(self, cond, other=None): + """ + .. versionadded:: 0.18.2 + + Return an Index of same shape as self and whose corresponding + entries are from self where cond is True and otherwise are from + other. + + Parameters + ---------- + cond : boolean same length as self + other : scalar, or array-like + """ + if other is None: + other = self._na_value + values = np.where(cond, self.values, other) + + from pandas.core.categorical import Categorical + cat = Categorical(values, + categories=self.categories, + ordered=self.ordered) + return self._shallow_copy(cat, **self._get_attributes_dict()) + def reindex(self, target, method=None, level=None, limit=None, tolerance=None): """ diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py index 9f71f9f17d835..05b2045a4850f 100644 --- a/pandas/indexes/multi.py +++ b/pandas/indexes/multi.py @@ -1084,6 +1084,10 @@ def repeat(self, n, *args, **kwargs): for label in self.labels], names=self.names, sortorder=self.sortorder, verify_integrity=False) + def where(self, cond, other=None): + raise NotImplementedError(".where is not supported for " + "MultiIndex operations") + def drop(self, labels, level=None, errors='raise'): """ Make new MultiIndex with passed list of labels deleted diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 8ea87e9d69c92..0002bd840def3 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -7,7 +7,7 @@ from pandas import (Series, Index, Float64Index, Int64Index, RangeIndex, MultiIndex, CategoricalIndex, DatetimeIndex, - TimedeltaIndex, PeriodIndex) + TimedeltaIndex, PeriodIndex, notnull) from pandas.util.testing import assertRaisesRegexp import pandas.util.testing as tm @@ -363,6 +363,18 @@ def test_numpy_repeat(self): tm.assertRaisesRegexp(ValueError, msg, np.repeat, i, rep, axis=0) + def test_where(self): + i = self.create_index() + result = i.where(notnull(i)) + expected = i + tm.assert_index_equal(result, expected) + + i2 = i.copy() + i2 = pd.Index([np.nan, np.nan] + i[2:].tolist()) + result = i.where(notnull(i2)) + expected = i2 + tm.assert_index_equal(result, expected) + def test_setops_errorcases(self): for name, idx in compat.iteritems(self.indices): # # non-iterable input diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index 66ddcdebff83b..7fff62b822e40 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -11,7 +11,7 @@ import numpy as np -from pandas import Categorical, compat +from pandas import Categorical, compat, notnull from pandas.util.testing import assert_almost_equal import pandas.core.config as cf import pandas as pd @@ -230,6 +230,19 @@ def f(x): ordered=False) tm.assert_categorical_equal(result, exp) + def test_where(self): + i = self.create_index() + result = i.where(notnull(i)) + expected = i + tm.assert_index_equal(result, expected) + + i2 = i.copy() + i2 = pd.CategoricalIndex([np.nan, np.nan] + i[2:].tolist(), + categories=i.categories) + result = i.where(notnull(i2)) + expected = i2 + tm.assert_index_equal(result, expected) + def test_append(self): ci = self.create_index() diff --git a/pandas/tests/indexes/test_datetimelike.py b/pandas/tests/indexes/test_datetimelike.py index bd3deb8e6ed36..b3b987ceb6ab6 100644 --- a/pandas/tests/indexes/test_datetimelike.py +++ b/pandas/tests/indexes/test_datetimelike.py @@ -7,7 +7,7 @@ from pandas import (DatetimeIndex, Float64Index, Index, Int64Index, NaT, Period, PeriodIndex, Series, Timedelta, TimedeltaIndex, date_range, period_range, - timedelta_range) + timedelta_range, notnull) import pandas.util.testing as tm @@ -449,6 +449,38 @@ def test_astype_raises(self): self.assertRaises(ValueError, idx.astype, 'datetime64') self.assertRaises(ValueError, idx.astype, 'datetime64[D]') + def test_where_other(self): + + # other is ndarray or Index + i = pd.date_range('20130101', periods=3, tz='US/Eastern') + + for arr in [np.nan, pd.NaT]: + result = i.where(notnull(i), other=np.nan) + expected = i + tm.assert_index_equal(result, expected) + + i2 = i.copy() + i2 = Index([pd.NaT, pd.NaT] + i[2:].tolist()) + result = i.where(notnull(i2), i2) + tm.assert_index_equal(result, i2) + + i2 = i.copy() + i2 = Index([pd.NaT, pd.NaT] + i[2:].tolist()) + result = i.where(notnull(i2), i2.values) + tm.assert_index_equal(result, i2) + + def test_where_tz(self): + i = pd.date_range('20130101', periods=3, tz='US/Eastern') + result = i.where(notnull(i)) + expected = i + tm.assert_index_equal(result, expected) + + i2 = i.copy() + i2 = Index([pd.NaT, pd.NaT] + i[2:].tolist()) + result = i.where(notnull(i2)) + expected = i2 + tm.assert_index_equal(result, expected) + def test_get_loc(self): idx = pd.date_range('2000-01-01', periods=3) @@ -776,6 +808,39 @@ def test_get_loc(self): with tm.assertRaises(KeyError): idx.get_loc('2000-01-10', method='nearest', tolerance='1 day') + def test_where(self): + i = self.create_index() + result = i.where(notnull(i)) + expected = i + tm.assert_index_equal(result, expected) + + i2 = i.copy() + i2 = pd.PeriodIndex([pd.NaT, pd.NaT] + i[2:].tolist(), + freq='D') + result = i.where(notnull(i2)) + expected = i2 + tm.assert_index_equal(result, expected) + + def test_where_other(self): + + i = self.create_index() + for arr in [np.nan, pd.NaT]: + result = i.where(notnull(i), other=np.nan) + expected = i + tm.assert_index_equal(result, expected) + + i2 = i.copy() + i2 = pd.PeriodIndex([pd.NaT, pd.NaT] + i[2:].tolist(), + freq='D') + result = i.where(notnull(i2), i2) + tm.assert_index_equal(result, i2) + + i2 = i.copy() + i2 = pd.PeriodIndex([pd.NaT, pd.NaT] + i[2:].tolist(), + freq='D') + result = i.where(notnull(i2), i2.values) + tm.assert_index_equal(result, i2) + def test_get_indexer(self): idx = pd.period_range('2000-01-01', periods=3).asfreq('H', how='start') tm.assert_numpy_array_equal(idx.get_indexer(idx), [0, 1, 2]) diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index b8804daa6cf19..10d87abf0d886 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -78,6 +78,14 @@ def test_labels_dtypes(self): self.assertTrue((i.labels[0] >= 0).all()) self.assertTrue((i.labels[1] >= 0).all()) + def test_where(self): + i = MultiIndex.from_tuples([('A', 1), ('A', 2)]) + + def f(): + i.where(True) + + self.assertRaises(NotImplementedError, f) + def test_repeat(self): reps = 2 numbers = [1, 2, 3] diff --git a/pandas/tests/types/test_types.py b/pandas/tests/types/test_types.py new file mode 100644 index 0000000000000..b9f6006cab731 --- /dev/null +++ b/pandas/tests/types/test_types.py @@ -0,0 +1,40 @@ +# -*- coding: utf-8 -*- +import nose +import numpy as np + +from pandas import NaT +from pandas.types.api import (DatetimeTZDtype, CategoricalDtype, + na_value_for_dtype, pandas_dtype) + + +def test_pandas_dtype(): + + assert pandas_dtype('datetime64[ns, US/Eastern]') == DatetimeTZDtype( + 'datetime64[ns, US/Eastern]') + assert pandas_dtype('category') == CategoricalDtype() + for dtype in ['M8[ns]', 'm8[ns]', 'object', 'float64', 'int64']: + assert pandas_dtype(dtype) == np.dtype(dtype) + + +def test_na_value_for_dtype(): + for dtype in [np.dtype('M8[ns]'), np.dtype('m8[ns]'), + DatetimeTZDtype('datetime64[ns, US/Eastern]')]: + assert na_value_for_dtype(dtype) is NaT + + for dtype in ['u1', 'u2', 'u4', 'u8', + 'i1', 'i2', 'i4', 'i8']: + assert na_value_for_dtype(np.dtype(dtype)) == 0 + + for dtype in ['bool']: + assert na_value_for_dtype(np.dtype(dtype)) is False + + for dtype in ['f2', 'f4', 'f8']: + assert np.isnan(na_value_for_dtype(np.dtype(dtype))) + + for dtype in ['O']: + assert np.isnan(na_value_for_dtype(np.dtype(dtype))) + + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py index 6df8954586adf..182c0637ae29c 100644 --- a/pandas/tools/merge.py +++ b/pandas/tools/merge.py @@ -7,6 +7,7 @@ import numpy as np from pandas.compat import range, lrange, lzip, zip, map, filter import pandas.compat as compat + from pandas.core.categorical import Categorical from pandas.core.frame import DataFrame, _merge_doc from pandas.core.generic import NDFrame @@ -22,6 +23,7 @@ import pandas.core.algorithms as algos import pandas.core.common as com import pandas.types.concat as _concat +from pandas.types.api import na_value_for_dtype import pandas.algos as _algos import pandas.hashtable as _hash @@ -281,8 +283,6 @@ def _indicator_post_merge(self, result): def _maybe_add_join_keys(self, result, left_indexer, right_indexer): - consolidate = False - left_has_missing = None right_has_missing = None @@ -294,38 +294,9 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer): take_left, take_right = None, None if name in result: -<<<<<<< HEAD - key_indexer = result.columns.get_loc(name) -======= ->>>>>>> e79b978... Preserve dtype in merge keys when possible if left_indexer is not None and right_indexer is not None: if name in self.left: -<<<<<<< HEAD - if len(self.left) == 0: - continue - - na_indexer = (left_indexer == -1).nonzero()[0] - if len(na_indexer) == 0: - continue - - right_na_indexer = right_indexer.take(na_indexer) - result.iloc[na_indexer, key_indexer] = ( - algos.take_1d(self.right_join_keys[i], - right_na_indexer)) - elif name in self.right: - if len(self.right) == 0: - continue - - na_indexer = (right_indexer == -1).nonzero()[0] - if len(na_indexer) == 0: - continue - - left_na_indexer = left_indexer.take(na_indexer) - result.iloc[na_indexer, key_indexer] = ( - algos.take_1d(self.left_join_keys[i], - left_na_indexer)) -======= if left_has_missing is None: left_has_missing = any(left_indexer == -1) @@ -333,8 +304,9 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer): if left_has_missing: take_right = self.right_join_keys[i] - if result[name].dtype != self.left[name].dtype: - take_left = self.left[name].values + if not com.is_dtype_equal(result[name].dtype, + self.left[name].dtype): + take_left = self.left[name]._values elif name in self.right: @@ -344,10 +316,10 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer): if right_has_missing: take_left = self.left_join_keys[i] - if result[name].dtype != self.right[name].dtype: - take_right = self.right[name].values + if not com.is_dtype_equal(result[name].dtype, + self.right[name].dtype): + take_right = self.right[name]._values ->>>>>>> e79b978... Preserve dtype in merge keys when possible elif left_indexer is not None \ and isinstance(self.left_join_keys[i], np.ndarray): @@ -357,39 +329,31 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer): if take_left is not None or take_right is not None: if take_left is None: - lvals = result[name].values + lvals = result[name]._values else: - lfill = take_left.dtype.type() - lvals = com.take_1d(take_left, left_indexer, fill_value=lfill) + lfill = na_value_for_dtype(take_left.dtype) + lvals = algos.take_1d(take_left, left_indexer, + fill_value=lfill) if take_right is None: - rvals = result[name].values + rvals = result[name]._values else: - rfill = take_right.dtype.type() - rvals = com.take_1d(take_right, right_indexer, fill_value=rfill) - - key_col = np.where(left_indexer != -1, lvals, rvals) + rfill = na_value_for_dtype(take_right.dtype) + rvals = algos.take_1d(take_right, right_indexer, + fill_value=rfill) + + # if we have an all missing left_indexer + # make sure to just use the right values + mask = left_indexer == -1 + if mask.all(): + key_col = rvals + else: + key_col = Index(lvals).where(~mask, rvals) if name in result: - if result[name].dtype != key_col.dtype: - consolidate = True result[name] = key_col else: result.insert(i, name or 'key_%d' % i, key_col) - consolidate = True - -<<<<<<< HEAD - # a faster way? - key_col = algos.take_1d(self.left_join_keys[i], left_indexer) - na_indexer = (left_indexer == -1).nonzero()[0] - right_na_indexer = right_indexer.take(na_indexer) - key_col.put(na_indexer, algos.take_1d(self.right_join_keys[i], - right_na_indexer)) - result.insert(i, name, key_col) -======= - if consolidate: - result.consolidate(inplace=True) ->>>>>>> e79b978... Preserve dtype in merge keys when possible def _get_join_info(self): left_ax = self.left._data.axes[self.axis] diff --git a/pandas/tools/tests/test_merge.py b/pandas/tools/tests/test_merge.py index dc04b00dc211d..0b934d5f02b15 100644 --- a/pandas/tools/tests/test_merge.py +++ b/pandas/tools/tests/test_merge.py @@ -506,11 +506,10 @@ def test_join_many_non_unique_index(self): expected = merge(df_partially_merged, df3, on=['a', 'b'], how='outer') result = result.reset_index() - - # result['a'] = result['a'].astype(np.float64) - # result['b'] = result['b'].astype(np.float64) - - assert_frame_equal(result, expected.ix[:, result.columns]) + expected = expected[result.columns] + expected['a'] = expected.a.astype('int64') + expected['b'] = expected.b.astype('int64') + assert_frame_equal(result, expected) df1 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 1], "c": [10, 20, 30]}) df2 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 2], "d": [100, 200, 300]}) @@ -674,14 +673,35 @@ def test_intelligently_handle_join_key(self): 'rvalue': lrange(6)}) joined = merge(left, right, on='key', how='outer') - expected = DataFrame({'key': [1, 1, 1, 1, 2, 2, 3, 4, 5.], + expected = DataFrame({'key': [1, 1, 1, 1, 2, 2, 3, 4, 5], 'value': np.array([0, 0, 1, 1, 2, 3, 4, np.nan, np.nan]), 'rvalue': np.array([0, 1, 0, 1, 2, 2, 3, 4, 5])}, columns=['value', 'key', 'rvalue']) - assert_frame_equal(joined, expected, check_dtype=False) + assert_frame_equal(joined, expected) + + def test_merge_join_key_dtype_cast(self): + # #8596 + + df1 = DataFrame({'key': [1], 'v1': [10]}) + df2 = DataFrame({'key': [2], 'v1': [20]}) + df = merge(df1, df2, how='outer') + self.assertEqual(df['key'].dtype, 'int64') + + df1 = DataFrame({'key': [True], 'v1': [1]}) + df2 = DataFrame({'key': [False], 'v1': [0]}) + df = merge(df1, df2, how='outer') + + # GH13169 + # this really should be bool + self.assertEqual(df['key'].dtype, 'object') - self.assertTrue(joined._data.is_consolidated()) + df1 = DataFrame({'val': [1]}) + df2 = DataFrame({'val': [2]}) + lkey = np.array([1]) + rkey = np.array([2]) + df = merge(df1, df2, left_on=lkey, right_on=rkey, how='outer') + self.assertEqual(df['key_0'].dtype, 'int64') def test_handle_join_key_pass_array(self): left = DataFrame({'key': [1, 1, 2, 2, 3], @@ -814,20 +834,32 @@ def test_merge_left_empty_right_notempty(self): # result will have object dtype exp_in.index = exp_in.index.astype(object) - for kwarg in [dict(left_index=True, right_index=True), - dict(left_index=True, right_on='x'), - dict(left_on='a', right_index=True), - dict(left_on='a', right_on='x')]: - + def check1(exp, kwarg): result = pd.merge(left, right, how='inner', **kwarg) - tm.assert_frame_equal(result, exp_in) + tm.assert_frame_equal(result, exp) result = pd.merge(left, right, how='left', **kwarg) - tm.assert_frame_equal(result, exp_in) + tm.assert_frame_equal(result, exp) + def check2(exp, kwarg): result = pd.merge(left, right, how='right', **kwarg) - tm.assert_frame_equal(result, exp_out) + tm.assert_frame_equal(result, exp) result = pd.merge(left, right, how='outer', **kwarg) - tm.assert_frame_equal(result, exp_out) + tm.assert_frame_equal(result, exp) + + for kwarg in [dict(left_index=True, right_index=True), + dict(left_index=True, right_on='x')]: + check1(exp_in, kwarg) + check2(exp_out, kwarg) + + kwarg = dict(left_on='a', right_index=True) + check1(exp_in, kwarg) + exp_out['a'] = [0, 1, 2] + check2(exp_out, kwarg) + + kwarg = dict(left_on='a', right_on='x') + check1(exp_in, kwarg) + exp_out['a'] = np.array([np.nan] * 3, dtype=object) + check2(exp_out, kwarg) def test_merge_left_notempty_right_empty(self): # GH 10824 @@ -846,20 +878,24 @@ def test_merge_left_notempty_right_empty(self): # result will have object dtype exp_in.index = exp_in.index.astype(object) - for kwarg in [dict(left_index=True, right_index=True), - dict(left_index=True, right_on='x'), - dict(left_on='a', right_index=True), - dict(left_on='a', right_on='x')]: - + def check1(exp, kwarg): result = pd.merge(left, right, how='inner', **kwarg) - tm.assert_frame_equal(result, exp_in) + tm.assert_frame_equal(result, exp) result = pd.merge(left, right, how='right', **kwarg) - tm.assert_frame_equal(result, exp_in) + tm.assert_frame_equal(result, exp) + def check2(exp, kwarg): result = pd.merge(left, right, how='left', **kwarg) - tm.assert_frame_equal(result, exp_out) + tm.assert_frame_equal(result, exp) result = pd.merge(left, right, how='outer', **kwarg) - tm.assert_frame_equal(result, exp_out) + tm.assert_frame_equal(result, exp) + + for kwarg in [dict(left_index=True, right_index=True), + dict(left_index=True, right_on='x'), + dict(left_on='a', right_index=True), + dict(left_on='a', right_on='x')]: + check1(exp_in, kwarg) + check2(exp_out, kwarg) def test_merge_nosort(self): # #2098, anything to do? @@ -1033,7 +1069,6 @@ def test_overlapping_columns_error_message(self): df2.columns = ['key1', 'foo', 'foo'] self.assertRaises(ValueError, merge, df, df2) -<<<<<<< HEAD def test_merge_on_datetime64tz(self): # GH11405 @@ -1062,7 +1097,7 @@ def test_merge_on_datetime64tz(self): tz='US/Eastern')) + [pd.NaT], 'value_y': [pd.NaT] + list(pd.date_range('20151011', periods=2, tz='US/Eastern')), - 'key': [1., 2, 3]}) + 'key': [1, 2, 3]}) result = pd.merge(left, right, on='key', how='outer') assert_frame_equal(result, expected) self.assertEqual(result['value_x'].dtype, 'datetime64[ns, US/Eastern]') @@ -1094,7 +1129,7 @@ def test_merge_on_periods(self): exp_y = pd.period_range('20151011', periods=2, freq='D') expected = DataFrame({'value_x': list(exp_x) + [pd.NaT], 'value_y': [pd.NaT] + list(exp_y), - 'key': [1., 2, 3]}) + 'key': [1, 2, 3]}) result = pd.merge(left, right, on='key', how='outer') assert_frame_equal(result, expected) self.assertEqual(result['value_x'].dtype, 'object') @@ -1336,7 +1371,7 @@ def test_indicator(self): 'col_conflict_x': [1, 2, np.nan, np.nan, np.nan, np.nan], 'col_left': ['a', 'b', np.nan, np.nan, np.nan, np.nan], 'col_conflict_y': [np.nan, 1, 2, 3, 4, 5], - 'col_right': [np.nan, 2, 2, 2, 2, 2]}, dtype='float64') + 'col_right': [np.nan, 2, 2, 2, 2, 2]}) df_result['_merge'] = Categorical( ['left_only', 'both', 'right_only', 'right_only', 'right_only', 'right_only'], @@ -1415,7 +1450,7 @@ def test_indicator(self): df4 = DataFrame({'col1': [1, 1, 3], 'col2': ['b', 'x', 'y']}) - hand_coded_result = DataFrame({'col1': [0, 1, 1, 3.0], + hand_coded_result = DataFrame({'col1': [0, 1, 1, 3], 'col2': ['a', 'b', 'x', 'y']}) hand_coded_result['_merge'] = Categorical( ['left_only', 'both', 'right_only', 'right_only'], @@ -1427,27 +1462,6 @@ def test_indicator(self): test5 = df3.merge(df4, on=['col1', 'col2'], how='outer', indicator=True) assert_frame_equal(test5, hand_coded_result) -======= - def test_merge_join_key_dtype_cast(self): - # #8596 - - df1 = DataFrame({'key': [1], 'v1': [10]}) - df2 = DataFrame({'key': [2], 'v1': [20]}) - df = merge(df1, df2, how='outer') - self.assertEqual(df['key'].dtype, 'int64') - - df1 = DataFrame({'key': [True], 'v1': [1]}) - df2 = DataFrame({'key': [False],'v1': [0]}) - df = merge(df1, df2, how='outer') - self.assertEqual(df['key'].dtype, 'bool') - - df1 = DataFrame({'val': [1]}) - df2 = DataFrame({'val': [2]}) - lkey = np.array([1]) - rkey = np.array([2]) - df = merge(df1, df2, left_on=lkey, right_on=rkey, how='outer') - self.assertEqual(df['key_0'].dtype, 'int64') ->>>>>>> e79b978... Preserve dtype in merge keys when possible def _check_merge(x, y): diff --git a/pandas/tseries/base.py b/pandas/tseries/base.py index e52afa74d95e2..42631d442a990 100644 --- a/pandas/tseries/base.py +++ b/pandas/tseries/base.py @@ -9,6 +9,7 @@ from pandas.compat.numpy import function as nv import numpy as np + from pandas.core import common as com, algorithms from pandas.core.common import (is_integer, is_float, is_bool_dtype, AbstractMethodError) @@ -74,22 +75,16 @@ def _round(self, freq, rounder): unit = to_offset(freq).nanos # round the local times - if getattr(self, 'tz', None) is not None: - values = self.tz_localize(None).asi8 - else: - values = self.asi8 + values = _ensure_datetimelike_to_i8(self) + result = (unit * rounder(values / float(unit))).astype('i8') attribs = self._get_attributes_dict() if 'freq' in attribs: attribs['freq'] = None if 'tz' in attribs: attribs['tz'] = None - result = self._shallow_copy(result, **attribs) - - # reconvert to local tz - if getattr(self, 'tz', None) is not None: - result = result.tz_localize(self.tz) - return result + return self._ensure_localized( + self._shallow_copy(result, **attribs)) @Appender(_round_doc % "round") def round(self, freq, *args, **kwargs): @@ -161,6 +156,29 @@ def _evaluate_compare(self, other, op): except TypeError: return result + def _ensure_localized(self, result): + """ + ensure that we are re-localized + + This is for compat as we can then call this on all datetimelike + indexes generally (ignored for Period/Timedelta) + + Parameters + ---------- + result : DatetimeIndex / i8 ndarray + + Returns + ------- + localized DTI + """ + + # reconvert to local tz + if getattr(self, 'tz', None) is not None: + if not isinstance(result, com.ABCIndexClass): + result = self._simple_new(result) + result = result.tz_localize(self.tz) + return result + @property def _box_func(self): """ @@ -727,6 +745,27 @@ def repeat(self, repeats, *args, **kwargs): nv.validate_repeat(args, kwargs) return self._shallow_copy(self.values.repeat(repeats), freq=None) + def where(self, cond, other=None): + """ + .. versionadded:: 0.18.2 + + Return an Index of same shape as self and whose corresponding + entries are from self where cond is True and otherwise are from + other. + + Parameters + ---------- + cond : boolean same length as self + other : scalar, or array-like + """ + other = _ensure_datetimelike_to_i8(other) + values = _ensure_datetimelike_to_i8(self) + result = np.where(cond, values, other).astype('i8') + + result = self._ensure_localized(result) + return self._shallow_copy(result, + **self._get_attributes_dict()) + def summary(self, name=None): """ return a summarized representation @@ -748,3 +787,19 @@ def summary(self, name=None): # display as values, not quoted result = result.replace("'", "") return result + + +def _ensure_datetimelike_to_i8(other): + """ helper for coercing an input scalar or array to i8 """ + if lib.isscalar(other) and com.isnull(other): + other = tslib.iNaT + elif isinstance(other, com.ABCIndexClass): + + # convert tz if needed + if getattr(other, 'tz', None) is not None: + other = other.tz_localize(None).asi8 + else: + other = other.asi8 + else: + other = np.array(other, copy=False).view('i8') + return other diff --git a/pandas/types/api.py b/pandas/types/api.py index bb61025a41a37..721d8d29bba8b 100644 --- a/pandas/types/api.py +++ b/pandas/types/api.py @@ -28,7 +28,11 @@ def pandas_dtype(dtype): ------- np.dtype or a pandas dtype """ - if isinstance(dtype, string_types): + if isinstance(dtype, DatetimeTZDtype): + return dtype + elif isinstance(dtype, CategoricalDtype): + return dtype + elif isinstance(dtype, string_types): try: return DatetimeTZDtype.construct_from_string(dtype) except TypeError: @@ -40,3 +44,32 @@ def pandas_dtype(dtype): pass return np.dtype(dtype) + +def na_value_for_dtype(dtype): + """ + Return a dtype compat na value + + Parameters + ---------- + dtype : string / dtype + + Returns + ------- + dtype compat na value + """ + + from pandas.core import common as com + from pandas import NaT + dtype = pandas_dtype(dtype) + + if (com.is_datetime64_dtype(dtype) or + com.is_datetime64tz_dtype(dtype) or + com.is_timedelta64_dtype(dtype)): + return NaT + elif com.is_float_dtype(dtype): + return np.nan + elif com.is_integer_dtype(dtype): + return 0 + elif com.is_bool_dtype(dtype): + return False + return np.nan