From d4942873c3e8a382f1f79d8ea19313f6c900ad67 Mon Sep 17 00:00:00 2001 From: Henning Sperr Date: Wed, 22 Apr 2015 09:30:49 +0900 Subject: [PATCH 1/3] FIX: interesction and union correct name chaning behavior. fixes #9943 partly #9862 --- doc/source/whatsnew/v0.16.1.txt | 346 +++++++++----------------------- pandas/core/index.py | 27 ++- pandas/tests/test_index.py | 157 ++++++++++++++- pandas/tseries/index.py | 7 +- 4 files changed, 281 insertions(+), 256 deletions(-) diff --git a/doc/source/whatsnew/v0.16.1.txt b/doc/source/whatsnew/v0.16.1.txt index 79a0c48238be7..0089e44fa25df 100755 --- a/doc/source/whatsnew/v0.16.1.txt +++ b/doc/source/whatsnew/v0.16.1.txt @@ -1,7 +1,7 @@ .. _whatsnew_0161: -v0.16.1 (May 11, 2015) ----------------------- +v0.16.1 (April ??, 2015) +------------------------ This is a minor bug-fix release from 0.16.0 and includes a a large number of bug fixes along several new features, enhancements, and performance improvements. @@ -10,27 +10,63 @@ We recommend that all users upgrade to this version. Highlights include: - Support for a ``CategoricalIndex``, a category based index, see :ref:`here ` -- New section on how-to-contribute to *pandas*, see :ref:`here ` -- Revised "Merge, join, and concatenate" documentation, including graphical examples to make it easier to understand each operations, see :ref:`here ` -- New method ``sample`` for drawing random samples from Series, DataFrames and Panels. See :ref:`here ` -- The default ``Index`` printing has changed to a more uniform format, see :ref:`here ` -- ``BusinessHour`` datetime-offset is now supported, see :ref:`here ` - -- Further enhancement to the ``.str`` accessor to make string operations easier, see :ref:`here ` .. contents:: What's new in v0.16.1 :local: :backlinks: none -.. _whatsnew_0161.enhancements: - -.. warning:: - In pandas 0.17.0, the sub-package ``pandas.io.data`` will be removed in favor of a separately installable package. See :ref:`here for details ` (:issue:`8961`) +.. 
_whatsnew_0161.enhancements: Enhancements ~~~~~~~~~~~~ +- Added ``StringMethods.capitalize()`` and ``swapcase`` which behave as the same as standard ``str`` (:issue:`9766`) +- Added ``StringMethods`` (.str accessor) to ``Index`` (:issue:`9068`) + + The ``.str`` accessor is now available for both ``Series`` and ``Index``. + + .. ipython:: python + + idx = Index([' jack', 'jill ', ' jesse ', 'frank']) + idx.str.strip() + + One special case for the `.str` accessor on ``Index`` is that if a string method returns ``bool``, the ``.str`` accessor + will return a ``np.array`` instead of a boolean ``Index`` (:issue:`8875`). This enables the following expression + to work naturally: + + + .. ipython:: python + + idx = Index(['a1', 'a2', 'b1', 'b2']) + s = Series(range(4), index=idx) + s + idx.str.startswith('a') + s[s.index.str.startswith('a')] + +- ``DataFrame.mask()`` and ``Series.mask()`` now support same keywords as ``where`` (:issue:`8801`) + +- ``drop`` function can now accept ``errors`` keyword to suppress ``ValueError`` raised when any of label does not exist in the target data. (:issue:`6736`) + + .. ipython:: python + + df = DataFrame(np.random.randn(3, 3), columns=['A', 'B', 'C']) + df.drop(['A', 'X'], axis=1, errors='ignore') + +- Allow conversion of values with dtype ``datetime64`` or ``timedelta64`` to strings using ``astype(str)`` (:issue:`9757`) +- ``get_dummies`` function now accepts ``sparse`` keyword. If set to ``True``, the return ``DataFrame`` is sparse, e.g. ``SparseDataFrame``. (:issue:`8823`) +- ``Period`` now accepts ``datetime64`` as value input. (:issue:`9054`) + +- Allow timedelta string conversion when leading zero is missing from time definition, ie `0:00:00` vs `00:00:00`. (:issue:`9570`) +- Allow Panel.shift with ``axis='items'`` (:issue:`9890`) + +- Trying to write an excel file now raises ``NotImplementedError`` if the ``DataFrame`` has a ``MultiIndex`` instead of writing a broken Excel file. 
(:issue:`9794`) +- Allow ``Categorical.add_categories`` to accept ``Series`` or ``np.array``. (:issue:`9927`) + +- Add/delete ``str/dt/cat`` accessors dynamically from ``__dir__``. (:issue:`9910`) + +- ``DataFrame`` and ``Series`` now have ``_constructor_expanddim`` property as overridable constructor for one higher dimensionality data. This should be used only when it is really needed, see :ref:`here ` + .. _whatsnew_0161.enhancements.categoricalindex: CategoricalIndex @@ -97,166 +133,16 @@ values NOT in the categories, similarly to how you can reindex ANY pandas index. df2.reindex(pd.Categorical(['a','e'],categories=list('abcde'))) df2.reindex(pd.Categorical(['a','e'],categories=list('abcde'))).index -See the :ref:`documentation ` for more. (:issue:`7629`, :issue:`10038`, :issue:`10039`) - -.. _whatsnew_0161.enhancements.sample: - -Sample -^^^^^^ - -Series, DataFrames, and Panels now have a new method: :meth:`~pandas.DataFrame.sample`. -The method accepts a specific number of rows or columns to return, or a fraction of the -total number or rows or columns. It also has options for sampling with or without replacement, -for passing in a column for weights for non-uniform sampling, and for setting seed values to -facilitate replication. (:issue:`2419`) - -.. ipython :: python - - example_series = Series([0,1,2,3,4,5]) - - # When no arguments are passed, returns 1 - example_series.sample() - - # One may specify either a number of rows: - example_series.sample(n=3) - - # Or a fraction of the rows: - example_series.sample(frac=0.5) - - # weights are accepted. - example_weights = [0, 0, 0.2, 0.2, 0.2, 0.4] - example_series.sample(n=3, weights=example_weights) - - # weights will also be normalized if they do not sum to one, - # and missing values will be treated as zeros. 
- example_weights2 = [0.5, 0, 0, 0, None, np.nan] - example_series.sample(n=1, weights=example_weights2) - - -When applied to a DataFrame, one may pass the name of a column to specify sampling weights -when sampling from rows. - -.. ipython :: python - - df = DataFrame({'col1':[9,8,7,6], 'weight_column':[0.5, 0.4, 0.1, 0]}) - df.sample(n=3, weights='weight_column') - - -.. _whatsnew_0161.enhancements.string: - -String Methods Enhancements -^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -:ref:`Continuing from v0.16.0 `, the following -enhancements make string operations easier and more consistent with standard python string operations. - - -- Added ``StringMethods`` (``.str`` accessor) to ``Index`` (:issue:`9068`) - - The ``.str`` accessor is now available for both ``Series`` and ``Index``. - - .. ipython:: python - - idx = Index([' jack', 'jill ', ' jesse ', 'frank']) - idx.str.strip() - - One special case for the `.str` accessor on ``Index`` is that if a string method returns ``bool``, the ``.str`` accessor - will return a ``np.array`` instead of a boolean ``Index`` (:issue:`8875`). This enables the following expression - to work naturally: - - .. ipython:: python - - idx = Index(['a1', 'a2', 'b1', 'b2']) - s = Series(range(4), index=idx) - s - idx.str.startswith('a') - s[s.index.str.startswith('a')] - -- The following new methods are accesible via ``.str`` accessor to apply the function to each values. (:issue:`9766`, :issue:`9773`, :issue:`10031`, :issue:`10045`, :issue:`10052`) - - ================ =============== =============== =============== ================ - .. .. Methods .. .. - ================ =============== =============== =============== ================ - ``capitalize()`` ``swapcase()`` ``normalize()`` ``partition()`` ``rpartition()`` - ``index()`` ``rindex()`` ``translate()`` - ================ =============== =============== =============== ================ - -- ``split`` now takes ``expand`` keyword to specify whether to expand dimensionality. 
``return_type`` is deprecated. (:issue:`9847`) - - .. ipython:: python - - s = Series(['a,b', 'a,c', 'b,c']) - - # return Series - s.str.split(',') - - # return DataFrame - s.str.split(',', expand=True) - - idx = Index(['a,b', 'a,c', 'b,c']) - - # return Index - idx.str.split(',') - - # return MultiIndex - idx.str.split(',', expand=True) - +See the :ref:`documentation ` for more. (:issue:`7629`) -- Improved ``extract`` and ``get_dummies`` methods for ``Index.str`` (:issue:`9980`) - - -.. _whatsnew_0161.enhancements.other: - -Other Enhancements -^^^^^^^^^^^^^^^^^^ - -- ``BusinessHour`` offset is now supported, which represents business hours starting from 09:00 - 17:00 on ``BusinessDay`` by default. See :ref:`Here ` for details. (:issue:`7905`) - - .. ipython:: python - - from pandas.tseries.offsets import BusinessHour - Timestamp('2014-08-01 09:00') + BusinessHour() - Timestamp('2014-08-01 07:00') + BusinessHour() - Timestamp('2014-08-01 16:30') + BusinessHour() - -- ``DataFrame.diff`` now takes an ``axis`` parameter that determines the direction of differencing (:issue:`9727`) - -- Allow ``clip``, ``clip_lower``, and ``clip_upper`` to accept array-like arguments as thresholds (This is a regression from 0.11.0). These methods now have an ``axis`` parameter which determines how the Series or DataFrame will be aligned with the threshold(s). (:issue:`6966`) - -- ``DataFrame.mask()`` and ``Series.mask()`` now support same keywords as ``where`` (:issue:`8801`) - -- ``drop`` function can now accept ``errors`` keyword to suppress ``ValueError`` raised when any of label does not exist in the target data. (:issue:`6736`) - - .. ipython:: python - - df = DataFrame(np.random.randn(3, 3), columns=['A', 'B', 'C']) - df.drop(['A', 'X'], axis=1, errors='ignore') - -- Add support for separating years and quarters using dashes, for - example 2014-Q1. 
(:issue:`9688`) - -- Allow conversion of values with dtype ``datetime64`` or ``timedelta64`` to strings using ``astype(str)`` (:issue:`9757`) -- ``get_dummies`` function now accepts ``sparse`` keyword. If set to ``True``, the return ``DataFrame`` is sparse, e.g. ``SparseDataFrame``. (:issue:`8823`) -- ``Period`` now accepts ``datetime64`` as value input. (:issue:`9054`) - -- Allow timedelta string conversion when leading zero is missing from time definition, ie `0:00:00` vs `00:00:00`. (:issue:`9570`) -- Allow ``Panel.shift`` with ``axis='items'`` (:issue:`9890`) - -- Trying to write an excel file now raises ``NotImplementedError`` if the ``DataFrame`` has a ``MultiIndex`` instead of writing a broken Excel file. (:issue:`9794`) -- Allow ``Categorical.add_categories`` to accept ``Series`` or ``np.array``. (:issue:`9927`) +.. _whatsnew_0161.api: -- Add/delete ``str/dt/cat`` accessors dynamically from ``__dir__``. (:issue:`9910`) -- Add ``normalize`` as a ``dt`` accessor method. (:issue:`10047`) +API changes +~~~~~~~~~~~ -- ``DataFrame`` and ``Series`` now have ``_constructor_expanddim`` property as overridable constructor for one higher dimensionality data. This should be used only when it is really needed, see :ref:`here ` -- ``pd.lib.infer_dtype`` now returns ``'bytes'`` in Python 3 where appropriate. (:issue:`10032`) -.. _whatsnew_0161.api: - -API changes -~~~~~~~~~~~ - When passing in an ax to ``df.plot( ..., ax=ax)``, the `sharex` kwarg will now default to `False`. The result is that the visibility of xlabels and xticklabels will not anymore be changed. You @@ -265,63 +151,15 @@ API changes If pandas creates the subplots itself (e.g. no passed in `ax` kwarg), then the default is still ``sharex=True`` and the visibility changes are applied. -- :meth:`~pandas.DataFrame.assign` now inserts new columns in alphabetical order. Previously - the order was arbitrary. 
(:issue:`9777`) - -- By default, ``read_csv`` and ``read_table`` will now try to infer the compression type based on the file extension. Set ``compression=None`` to restore the previous behavior (no decompression). (:issue:`9770`) - -.. _whatsnew_0161.deprecations: - -Deprecations -^^^^^^^^^^^^ - -- ``Series.str.split``'s ``return_type`` keyword was removed in favor of ``expand`` (:issue:`9847`) - - -.. _whatsnew_0161.index_repr: - -Index Representation -~~~~~~~~~~~~~~~~~~~~ - -The string representation of ``Index`` and its sub-classes have now been unified. These will show a single-line display if there are few values; a wrapped multi-line display for a lot of values (but less than ``display.max_seq_items``; if lots of items (> ``display.max_seq_items``) will show a truncated display (the head and tail of the data). The formatting for ``MultiIndex`` is unchanges (a multi-line wrapped display). The display width responds to the option ``display.max_seq_items``, which is defaulted to 100. (:issue:`6482`) -Previous Behavior -.. 
code-block:: python - - In [2]: pd.Index(range(4),name='foo') - Out[2]: Int64Index([0, 1, 2, 3], dtype='int64') - - In [3]: pd.Index(range(104),name='foo') - Out[3]: Int64Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, ...], dtype='int64') - - In [4]: pd.date_range('20130101',periods=4,name='foo',tz='US/Eastern') - Out[4]: - - [2013-01-01 00:00:00-05:00, ..., 2013-01-04 00:00:00-05:00] - Length: 4, Freq: D, Timezone: US/Eastern - - In [5]: pd.date_range('20130101',periods=104,name='foo',tz='US/Eastern') - Out[5]: - - [2013-01-01 00:00:00-05:00, ..., 2013-04-14 00:00:00-04:00] - Length: 104, Freq: D, Timezone: US/Eastern - -New Behavior - -.. ipython:: python +- Add support for separating years and quarters using dashes, for + example 2014-Q1. (:issue:`9688`) - pd.set_option('display.width', 80) - pd.Index(range(4), name='foo') - pd.Index(range(30), name='foo') - pd.Index(range(104), name='foo') - pd.CategoricalIndex(['a','bb','ccc','dddd'], ordered=True, name='foobar') - pd.CategoricalIndex(['a','bb','ccc','dddd']*10, ordered=True, name='foobar') - pd.CategoricalIndex(['a','bb','ccc','dddd']*100, ordered=True, name='foobar') - pd.date_range('20130101',periods=4, name='foo', tz='US/Eastern') - pd.date_range('20130101',periods=25, freq='D') - pd.date_range('20130101',periods=104, name='foo', tz='US/Eastern') +- :meth:`~pandas.DataFrame.assign` now inserts new columns in alphabetical order. Previously + the order was arbitrary. (:issue:`9777`) +- By default, ``read_csv`` and ``read_table`` will now try to infer the compression type based on the file extension. 
Set ``compression=None`` to restore the previous behavior (no decompression). (:issue:`9770`) .. _whatsnew_0161.performance: @@ -330,7 +168,8 @@ Performance Improvements - Improved csv write performance with mixed dtypes, including datetimes by up to 5x (:issue:`9940`) - Improved csv write performance generally by 2x (:issue:`9940`) -- Improved the performance of ``pd.lib.max_len_string_array`` by 5-7x (:issue:`10024`) + + .. _whatsnew_0161.bug_fixes: @@ -338,65 +177,80 @@ Performance Improvements Bug Fixes ~~~~~~~~~ -- Bug where labels did not appear properly in the legend of ``DataFrame.plot()``, passing ``label=`` arguments works, and Series indices are no longer mutated. (:issue:`9542`) -- Bug in json serialization causing a segfault when a frame had zero length. (:issue:`9805`) +- Fixed bug (:issue:`9542`) where labels did not appear properly in legend of ``DataFrame.plot()``. Passing ``label=`` args also now works, and series indices are no longer mutated. +- Bug in json serialization when frame has length zero.(:issue:`9805`) - Bug in ``read_csv`` where missing trailing delimiters would cause segfault. (:issue:`5664`) - Bug in retaining index name on appending (:issue:`9862`) - Bug in ``scatter_matrix`` draws unexpected axis ticklabels (:issue:`5662`) - Fixed bug in ``StataWriter`` resulting in changes to input ``DataFrame`` upon save (:issue:`9795`). 
- Bug in ``transform`` causing length mismatch when null entries were present and a fast aggregator was being used (:issue:`9697`) - Bug in ``equals`` causing false negatives when block order differed (:issue:`9330`) -- Bug in grouping with multiple ``pd.Grouper`` where one is non-time based (:issue:`10063`) - Bug in ``read_sql_table`` error when reading postgres table with timezone (:issue:`7139`) - Bug in ``DataFrame`` slicing may not retain metadata (:issue:`9776`) - Bug where ``TimdeltaIndex`` were not properly serialized in fixed ``HDFStore`` (:issue:`9635`) -- Bug with ``TimedeltaIndex`` constructor ignoring ``name`` when given another ``TimedeltaIndex`` as data (:issue:`10025`). -- Bug in ``DataFrameFormatter._get_formatted_index`` with not applying ``max_colwidth`` to the ``DataFrame`` index (:issue:`7856`) -- Bug in ``.loc`` with a read-only ndarray data source (:issue:`10043`) -- Bug in ``groupby.apply()`` that would raise if a passed user defined function either returned only ``None`` (for all input). (:issue:`9685`) -- Always use temporary files in pytables tests (:issue:`9992`) - Bug in plotting continuously using ``secondary_y`` may not show legend properly. (:issue:`9610`, :issue:`9779`) - Bug in ``DataFrame.plot(kind="hist")`` results in ``TypeError`` when ``DataFrame`` contains non-numeric columns (:issue:`9853`) - Bug where repeated plotting of ``DataFrame`` with a ``DatetimeIndex`` may raise ``TypeError`` (:issue:`9852`) - Bug in ``setup.py`` that would allow an incompat cython version to build (:issue:`9827`) - Bug in plotting ``secondary_y`` incorrectly attaches ``right_ax`` property to secondary axes specifying itself recursively. 
(:issue:`9861`) + - Bug in ``Series.quantile`` on empty Series of type ``Datetime`` or ``Timedelta`` (:issue:`9675`) - Bug in ``where`` causing incorrect results when upcasting was required (:issue:`9731`) - Bug in ``FloatArrayFormatter`` where decision boundary for displaying "small" floats in decimal format is off by one order of magnitude for a given display.precision (:issue:`9764`) - Fixed bug where ``DataFrame.plot()`` raised an error when both ``color`` and ``style`` keywords were passed and there was no color symbol in the style strings (:issue:`9671`) -- Not showing a ``DeprecationWarning`` on combining list-likes with an ``Index`` (:issue:`10083`) - Bug in ``read_csv`` and ``read_table`` when using ``skip_rows`` parameter if blank lines are present. (:issue:`9832`) - Bug in ``read_csv()`` interprets ``index_col=True`` as ``1`` (:issue:`9798`) - Bug in index equality comparisons using ``==`` failing on Index/MultiIndex type incompatibility (:issue:`9785`) - Bug in which ``SparseDataFrame`` could not take `nan` as a column name (:issue:`8822`) - Bug in ``to_msgpack`` and ``read_msgpack`` zlib and blosc compression support (:issue:`9783`) + - Bug ``GroupBy.size`` doesn't attach index name properly if grouped by ``TimeGrouper`` (:issue:`9925`) -- Bug causing an exception in slice assignments because ``length_of_indexer`` returns wrong results (:issue:`9995`) -- Bug in csv parser causing lines with initial whitespace plus one non-space character to be skipped. (:issue:`9710`) -- Bug in C csv parser causing spurious NaNs when data started with newline followed by whitespace. 
(:issue:`10022`) -- Bug causing elements with a null group to spill into the final group when grouping by a ``Categorical`` (:issue:`9603`) -- Bug where .iloc and .loc behavior is not consistent on empty dataframes (:issue:`9964`) -- Bug in invalid attribute access on a ``TimedeltaIndex`` incorrectly raised ``ValueError`` instead of ``AttributeError`` (:issue:`9680`) + + + + + + + + + + + + + + - Bug in unequal comparisons between categorical data and a scalar, which was not in the categories (e.g. ``Series(Categorical(list("abc"), ordered=True)) > "d"``. This returned ``False`` for all elements, but now raises a ``TypeError``. Equality comparisons also now return ``False`` for ``==`` and ``True`` for ``!=``. (:issue:`9848`) - Bug in DataFrame ``__setitem__`` when right hand side is a dictionary (:issue:`9874`) - Bug in ``where`` when dtype is ``datetime64/timedelta64``, but dtype of other is not (:issue:`9804`) - Bug in ``MultiIndex.sortlevel()`` results in unicode level name breaks (:issue:`9856`) - Bug in which ``groupby.transform`` incorrectly enforced output dtypes to match input dtypes. (:issue:`9807`) -- Bug in ``DataFrame`` constructor when ``columns`` parameter is set, and ``data`` is an empty list (:issue:`9939`) + - Bug in bar plot with ``log=True`` raises ``TypeError`` if all values are less than 1 (:issue:`9905`) - Bug in horizontal bar plot ignores ``log=True`` (:issue:`9905`) -- Bug in PyTables queries that did not return proper results using the index (:issue:`8265`, :issue:`9676`) + + + + + + + + - Bug where dividing a dataframe containing values of type ``Decimal`` by another ``Decimal`` would raise. (:issue:`9787`) - Bug where using DataFrames asfreq would remove the name of the index. (:issue:`9885`) -- Bug causing extra index point when resample BM/BQ (:issue:`9756`) - Changed caching in ``AbstractHolidayCalendar`` to be at the instance level rather than at the class level as the latter can result in unexpected behaviour. 
(:issue:`9552`) + - Fixed latex output for multi-indexed dataframes (:issue:`9778`) - Bug causing an exception when setting an empty range using ``DataFrame.loc`` (:issue:`9596`) + + - Bug in hiding ticklabels with subplots and shared axes when adding a new plot to an existing grid of axes (:issue:`9158`) -- Bug in ``transform`` and ``filter`` when grouping on a categorical variable (:issue:`9921`) -- Bug in ``transform`` when groups are equal in number and dtype to the input index (:issue:`9700`) -- Google BigQuery connector now imports dependencies on a per-method basis.(:issue:`9713`) -- Updated BigQuery connector to no longer use deprecated ``oauth2client.tools.run()`` (:issue:`8327`) -- Bug in subclassed ``DataFrame``. It may not return the correct class, when slicing or subsetting it. (:issue:`9632`) -- Bug in ``.median()`` where non-float null values are not handled correctly (:issue:`10040`) -- Bug in Series.fillna() where it raises if a numerically convertible string is given (:issue:`10092`) + + + + + + + + + +- ``union`` and ``intersection`` no longer change the index name. 
(:issue:`9943`) diff --git a/pandas/core/index.py b/pandas/core/index.py index de30fee4009f4..e135b193e0d78 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -1370,14 +1370,17 @@ def union(self, other): if len(other) == 0 or self.equals(other): return self + other = _ensure_index(other) + if len(self) == 0: - return _ensure_index(other) + return other self._assert_can_do_setop(other) if not is_dtype_equal(self.dtype,other.dtype): this = self.astype('O') other = other.astype('O') + return this.union(other) if self.is_monotonic and other.is_monotonic: @@ -1423,7 +1426,7 @@ def union(self, other): return self._wrap_union_result(other, result) def _wrap_union_result(self, other, result): - name = self.name if self.name == other.name else None + name = self.name if other.name == self.name or other.name==None else None return self.__class__(data=result, name=name) def intersection(self, other): @@ -1457,6 +1460,7 @@ def intersection(self, other): if self.is_monotonic and other.is_monotonic: try: result = self._inner_indexer(self.values, other.values)[0] + return self._wrap_union_result(other, result) except TypeError: pass @@ -1470,8 +1474,9 @@ def intersection(self, other): indexer = indexer[indexer != -1] taken = self.take(indexer) - if self.name != other.name: + if self.name != other.name and not other.name == None: taken.name = None + return taken def difference(self, other): @@ -5464,7 +5469,9 @@ def union(self, other): if len(other) == 0 or self.equals(other): return self - result_names = self.names if self.names == other.names else None + result_names = None + if self.names == other.names or other.names is None: + result_names = self.names uniq_tuples = lib.fast_unique_multiple([self.values, other.values]) return MultiIndex.from_arrays(lzip(*uniq_tuples), sortorder=0, @@ -5487,7 +5494,9 @@ def intersection(self, other): if self.equals(other): return self - result_names = self.names if self.names == other.names else None + result_names = None + if 
self.names == other.names or other.name is None: + result_names = self.names self_tuples = self.values other_tuples = other.values @@ -5520,7 +5529,9 @@ def difference(self, other): ' tuples') result_names = self.names else: - result_names = self.names if self.names == other.names else None + result_names = None + if self.names == other.names or other.names == None: + result_names = self.names if self.equals(other): return MultiIndex(levels=[[]] * self.nlevels, @@ -5615,7 +5626,9 @@ def _bounds(self): return self.__bounds def _wrap_joined_index(self, joined, other): - names = self.names if self.names == other.names else None + names = None + if self.names == other.names or other.names == None: + names = self.names return MultiIndex.from_tuples(joined, names=names) @Appender(Index.isin.__doc__) diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index 444aa2a0bab1e..9d88509d07dfe 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -17,7 +17,7 @@ CategoricalIndex, DatetimeIndex, TimedeltaIndex, PeriodIndex) from pandas.core.index import InvalidIndexError, NumericIndex from pandas.util.testing import (assert_almost_equal, assertRaisesRegexp, - assert_copy) + assert_copy, assert_frame_equal) from pandas import compat from pandas.compat import long @@ -616,6 +616,157 @@ def test_shift(self): shifted.name = 'shifted' self.assertEqual(shifted.name, shifted.shift(1, 'D').name) + def test_union_naming_behavior(self): + #9965 + idx_name_a = pd.Index([1,2,3], name='a') + idx_name_b = pd.Index([4,5,6], name='b') + idx2_name_a = pd.Index([2,9,8], name='a') + + stridx_name_stra = pd.Index(['1','2'], name='stra') + stridx_name_a = pd.Index(['1','2'], name='a') + + idx_name_none = pd.Index(['1','2'], name=None) + + dateindex_name_a = pd.DatetimeIndex([datetime.today()], name='a') + dateindex_name_b = pd.DatetimeIndex([datetime.today()], name='b') + dateindex_name_None = pd.DatetimeIndex([datetime.today()], name=None) + + python_array = 
[1,2,3] + numpy_array = np.array([1,2,3]) + + #index union index naming behavior + self.assertEqual(idx_name_a.union(idx_name_b).name, None) + self.assertEqual(idx_name_a.union(idx2_name_a).name, 'a') + + #index union array + self.assertEqual(idx_name_a.union(python_array).name, 'a') + self.assertEqual(idx_name_a.union(numpy_array).name, 'a') + + #index union index different dtype + self.assertEqual(idx_name_a.union(stridx_name_a).name, 'a') + self.assertEqual(idx_name_a.union(stridx_name_stra).name, None) + + #index union index with no name + self.assertEqual(idx_name_a.union(idx_name_none).name, 'a') + + #index union dateindex + self.assertEqual(idx_name_a.union(dateindex_name_a).name, 'a') + self.assertEqual(idx_name_a.union(dateindex_name_b).name, None) + self.assertEqual(idx_name_a.union(dateindex_name_None).name, 'a') + + #dateindex union + self.assertEqual(dateindex_name_a.union(python_array).name, 'a') + self.assertEqual(dateindex_name_a.union(numpy_array).name, 'a') + + self.assertEqual(dateindex_name_a.union(idx_name_none).name, 'a') + self.assertEqual(dateindex_name_a.union(dateindex_name_b).name, None) + self.assertEqual(dateindex_name_a.union(dateindex_name_None).name, 'a') + self.assertEqual(dateindex_name_a.union(idx_name_a).name, 'a') + + def test_intersection_naming_behavior(self): + #9965 + idx_name_a = pd.Index([1,2,3], name='a') + idx_name_b = pd.Index([4,5,6], name='b') + idx2_name_a = pd.Index([2,9,8], name='a') + + stridx_name_stra = pd.Index(['1','2'], name='stra') + stridx_name_a = pd.Index(['1','2'], name='a') + + idx_name_none = pd.Index(['1','2'], name=None) + + dateindex_name_a = pd.DatetimeIndex([datetime.today()], name='a') + dateindex_name_b = pd.DatetimeIndex([datetime.today()], name='b') + dateindex_name_None = pd.DatetimeIndex([datetime.today()], name=None) + + python_array = [1,2,3] + numpy_array = np.array([1,2,3]) + + #index intersection index naming behavior + self.assertEqual(idx_name_a.intersection(idx_name_b).name, None) + 
self.assertEqual(idx_name_a.intersection(idx2_name_a).name, 'a') + + #index intersection array + self.assertEqual(idx_name_a.intersection(python_array).name, 'a') + self.assertEqual(idx_name_a.intersection(numpy_array).name, 'a') + + #index intersection index different dtype + self.assertEqual(idx_name_a.intersection(stridx_name_a).name, 'a') + self.assertEqual(idx_name_a.intersection(stridx_name_stra).name, None) + + #index intersection index with no name + self.assertEqual(idx_name_a.intersection(idx_name_none).name, 'a') + + #index intersection dateindex + self.assertEqual(idx_name_a.intersection(dateindex_name_a).name, 'a') + self.assertEqual(idx_name_a.intersection(dateindex_name_b).name, None) + self.assertEqual(idx_name_a.intersection(dateindex_name_None).name, 'a') + + #dateindex intersection + self.assertEqual(dateindex_name_a.intersection(python_array).name, 'a') + self.assertEqual(dateindex_name_a.intersection(numpy_array).name, 'a') + + self.assertEqual(dateindex_name_a.intersection(idx_name_none).name, 'a') + self.assertEqual(dateindex_name_a.intersection(dateindex_name_b).name, None) + self.assertEqual(dateindex_name_a.intersection(dateindex_name_None).name, 'a') + self.assertEqual(dateindex_name_a.intersection(idx_name_a).name, 'a') + + def test_append_naming_behavior(self): + #9965 + idx_name_a = pd.Index([1,2,3], name='a') + idx_name_b = pd.Index([4,5,6], name='b') + idx2_name_a = pd.Index([2,9,8], name='a') + + stridx_name_stra = pd.Index(['1','2'], name='stra') + stridx_name_a = pd.Index(['1','2'], name='a') + + idx_name_none = pd.Index(['1','2'], name=None) + + dateindex_name_a = pd.DatetimeIndex([datetime.today()], name='a') + dateindex_name_b = pd.DatetimeIndex([datetime.today()], name='b') + dateindex_name_None = pd.DatetimeIndex([datetime.today()], name=None) + + python_array = [1,2,3] + python_array_transposed = [[1],[2],[3]] + numpy_array = np.array([1,2,3]) + + #index append index naming behavior + 
self.assertEqual(idx_name_a.append(idx_name_b).name, None) + self.assertEqual(idx_name_a.append(idx2_name_a).name, 'a') + + #index append array + self.assertEqual(idx_name_a.append(python_array_transposed).name, 'a') + self.assertEqual(idx_name_a.append(numpy_array.T).name, 'a') + + #index append index different dtype + self.assertEqual(idx_name_a.append(stridx_name_a).name, 'a') + self.assertEqual(idx_name_a.append(stridx_name_stra).name, None) + + #index append index with no name + self.assertEqual(idx_name_a.append(idx_name_none).name, 'a') + + #index append dateindex + self.assertEqual(idx_name_a.append(dateindex_name_a).name, 'a') + self.assertEqual(idx_name_a.append(dateindex_name_b).name, None) + self.assertEqual(idx_name_a.append(dateindex_name_None).name, 'a') + + #dateindex append + self.assertEqual(dateindex_name_a.append(python_array_transposed).name, 'a') + self.assertEqual(dateindex_name_a.append(numpy_array.T).name, 'a') + + self.assertEqual(dateindex_name_a.append(idx_name_none).name, 'a') + self.assertEqual(dateindex_name_a.append(dateindex_name_b).name, None) + self.assertEqual(dateindex_name_a.append(dateindex_name_None).name, 'a') + self.assertEqual(dateindex_name_a.append(idx_name_a).name, 'a') + + def test_intersection_preserves_name(self): + #GH 9943 + df = pd.DataFrame([np.nan, np.nan], columns = ['tags'], index=pd.Int64Index([4815961, 4815962], dtype='int64', name='id')) + self.assertEqual(str(df), ' tags\nid \n4815961 NaN\n4815962 NaN') + L = [4815962] + self.assertEqual(list(L), list(df.index.intersection(L))) + self.assertEqual( df.ix[L].tags.index.name, df.ix[df.index.intersection(L)].tags.index.name) + assert_frame_equal(df.ix[L], df.ix[df.index.intersection(L)]) + def test_intersection(self): first = self.strIndex[:20] second = self.strIndex[:10] @@ -694,6 +845,10 @@ def test_union(self): union = first.union(second) self.assertIsNone(union.name) + #union array-like + union = first.union([1234]) + self.assertEqual('A', union.name) + 
def test_add(self): # - API change GH 8226 diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index f3803a04baf01..e8ae0a75721ce 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -870,7 +870,10 @@ def append(self, other): to_concat.append(other) for obj in to_concat: - if isinstance(obj, Index) and obj.name != name: + if (isinstance(obj, Index) and + obj.name != name and + obj.name is not None): + name = None break @@ -1017,7 +1020,7 @@ def __iter__(self): yield v def _wrap_union_result(self, other, result): - name = self.name if self.name == other.name else None + name = self.name if self.name == other.name or other.name == None else None if self.tz != other.tz: raise ValueError('Passed item and index have different timezone') return self._simple_new(result, name=name, freq=None, tz=self.tz) From 3e515cee291b5449e322256a2a6557d7049d71ca Mon Sep 17 00:00:00 2001 From: Henning Sperr Date: Wed, 13 May 2015 07:47:13 +0900 Subject: [PATCH 2/3] FIX: Reworked the tests, removed the equal None --- pandas/core/index.py | 22 +++- pandas/tests/test_index.py | 231 +++++++++++++++---------------------- 2 files changed, 109 insertions(+), 144 deletions(-) diff --git a/pandas/core/index.py b/pandas/core/index.py index e135b193e0d78..7f95ac3af1dc5 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -1368,11 +1368,18 @@ def union(self, other): raise TypeError('Input must be iterable.') if len(other) == 0 or self.equals(other): + if (len(other) and + hasattr(other, 'name') and not + other.name == self.name and not + other.name is None): + self.name = None return self other = _ensure_index(other) if len(self) == 0: + if not other.name == self.name: + other.name = None return other self._assert_can_do_setop(other) @@ -1426,7 +1433,10 @@ def union(self, other): return self._wrap_union_result(other, result) def _wrap_union_result(self, other, result): - name = self.name if other.name == self.name or other.name==None else None + name = None + 
if self.name == other.name or other.name is None: + name = self.name + return self.__class__(data=result, name=name) def intersection(self, other): @@ -1450,6 +1460,10 @@ def intersection(self, other): other = _ensure_index(other) if self.equals(other): + if (hasattr(other, 'name') + and not other.name is None + and not other.name == self.name): + self.name = None return self if not is_dtype_equal(self.dtype,other.dtype): @@ -1474,7 +1488,7 @@ def intersection(self, other): indexer = indexer[indexer != -1] taken = self.take(indexer) - if self.name != other.name and not other.name == None: + if self.name != other.name and not other.name is None: taken.name = None return taken @@ -5530,7 +5544,7 @@ def difference(self, other): result_names = self.names else: result_names = None - if self.names == other.names or other.names == None: + if self.names == other.names or other.names is None: result_names = self.names if self.equals(other): @@ -5627,7 +5641,7 @@ def _bounds(self): def _wrap_joined_index(self, joined, other): names = None - if self.names == other.names or other.names == None: + if self.names == other.names or other.names is None: names = self.names return MultiIndex.from_tuples(joined, names=names) diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index 9d88509d07dfe..25489af84f3fa 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -17,7 +17,7 @@ CategoricalIndex, DatetimeIndex, TimedeltaIndex, PeriodIndex) from pandas.core.index import InvalidIndexError, NumericIndex from pandas.util.testing import (assert_almost_equal, assertRaisesRegexp, - assert_copy, assert_frame_equal) + assert_copy) from pandas import compat from pandas.compat import long @@ -272,8 +272,8 @@ def setUp(self): ) self.setup_indices() - def create_index(self): - return Index(list('abcde')) + def create_index(self, content=list('abcde'), name=None): + return Index(content, name=name) def test_new_axis(self): new_index = self.dateIndex[None, :] @@ 
-618,154 +618,105 @@ def test_shift(self): def test_union_naming_behavior(self): #9965 - idx_name_a = pd.Index([1,2,3], name='a') - idx_name_b = pd.Index([4,5,6], name='b') - idx2_name_a = pd.Index([2,9,8], name='a') - - stridx_name_stra = pd.Index(['1','2'], name='stra') - stridx_name_a = pd.Index(['1','2'], name='a') - - idx_name_none = pd.Index(['1','2'], name=None) - - dateindex_name_a = pd.DatetimeIndex([datetime.today()], name='a') - dateindex_name_b = pd.DatetimeIndex([datetime.today()], name='b') - dateindex_name_None = pd.DatetimeIndex([datetime.today()], name=None) - - python_array = [1,2,3] - numpy_array = np.array([1,2,3]) - - #index union index naming behavior - self.assertEqual(idx_name_a.union(idx_name_b).name, None) - self.assertEqual(idx_name_a.union(idx2_name_a).name, 'a') - - #index union array - self.assertEqual(idx_name_a.union(python_array).name, 'a') - self.assertEqual(idx_name_a.union(numpy_array).name, 'a') - - #index union index different dtype - self.assertEqual(idx_name_a.union(stridx_name_a).name, 'a') - self.assertEqual(idx_name_a.union(stridx_name_stra).name, None) - - #index union index with no name - self.assertEqual(idx_name_a.union(idx_name_none).name, 'a') - - #index union dateindex - self.assertEqual(idx_name_a.union(dateindex_name_a).name, 'a') - self.assertEqual(idx_name_a.union(dateindex_name_b).name, None) - self.assertEqual(idx_name_a.union(dateindex_name_None).name, 'a') - - #dateindex union - self.assertEqual(dateindex_name_a.union(python_array).name, 'a') - self.assertEqual(dateindex_name_a.union(numpy_array).name, 'a') - - self.assertEqual(dateindex_name_a.union(idx_name_none).name, 'a') - self.assertEqual(dateindex_name_a.union(dateindex_name_b).name, None) - self.assertEqual(dateindex_name_a.union(dateindex_name_None).name, 'a') - self.assertEqual(dateindex_name_a.union(idx_name_a).name, 'a') + self.compare_naming([1,2,3],[4,5,6],[2,5,6], func_name='union') + 
self.compare_naming(['1','2','3'],['4','5','6'],['2','5','6'], func_name='union') + self.compare_naming([1,'2',3],[4,'5','6'],[2,3,6], func_name='union') + self.compare_naming([1.,2.,3.],[4.,5.,6.],[2.,3.,6.], func_name='union') def test_intersection_naming_behavior(self): #9965 - idx_name_a = pd.Index([1,2,3], name='a') - idx_name_b = pd.Index([4,5,6], name='b') - idx2_name_a = pd.Index([2,9,8], name='a') - - stridx_name_stra = pd.Index(['1','2'], name='stra') - stridx_name_a = pd.Index(['1','2'], name='a') - - idx_name_none = pd.Index(['1','2'], name=None) - - dateindex_name_a = pd.DatetimeIndex([datetime.today()], name='a') - dateindex_name_b = pd.DatetimeIndex([datetime.today()], name='b') - dateindex_name_None = pd.DatetimeIndex([datetime.today()], name=None) - - python_array = [1,2,3] - numpy_array = np.array([1,2,3]) - - #index intersection index naming behavior - self.assertEqual(idx_name_a.intersection(idx_name_b).name, None) - self.assertEqual(idx_name_a.intersection(idx2_name_a).name, 'a') - - #index intersection array - self.assertEqual(idx_name_a.intersection(python_array).name, 'a') - self.assertEqual(idx_name_a.intersection(numpy_array).name, 'a') - - #index intersection index different dtype - self.assertEqual(idx_name_a.intersection(stridx_name_a).name, 'a') - self.assertEqual(idx_name_a.intersection(stridx_name_stra).name, None) - - #index intersection index with no name - self.assertEqual(idx_name_a.intersection(idx_name_none).name, 'a') - - #index intersection dateindex - self.assertEqual(idx_name_a.intersection(dateindex_name_a).name, 'a') - self.assertEqual(idx_name_a.intersection(dateindex_name_b).name, None) - self.assertEqual(idx_name_a.intersection(dateindex_name_None).name, 'a') - - #dateindex intersection - self.assertEqual(dateindex_name_a.intersection(python_array).name, 'a') - self.assertEqual(dateindex_name_a.intersection(numpy_array).name, 'a') - - self.assertEqual(dateindex_name_a.intersection(idx_name_none).name, 'a') - 
self.assertEqual(dateindex_name_a.intersection(dateindex_name_b).name, None) - self.assertEqual(dateindex_name_a.intersection(dateindex_name_None).name, 'a') - self.assertEqual(dateindex_name_a.intersection(idx_name_a).name, 'a') + self.compare_naming([1,2,3],[4,5,6],[2,5,6], func_name='intersection') + self.compare_naming(['1','2','3'],['4','5','6'],['2','5','6'], func_name='intersection') + self.compare_naming([1,'2',3],[4,'5','6'],[2,3,6], func_name='intersection') + self.compare_naming([1.,2.,3.],[4.,5.,6.],[2.,3.,6.], func_name='intersection') def test_append_naming_behavior(self): #9965 - idx_name_a = pd.Index([1,2,3], name='a') - idx_name_b = pd.Index([4,5,6], name='b') - idx2_name_a = pd.Index([2,9,8], name='a') - - stridx_name_stra = pd.Index(['1','2'], name='stra') - stridx_name_a = pd.Index(['1','2'], name='a') - - idx_name_none = pd.Index(['1','2'], name=None) - - dateindex_name_a = pd.DatetimeIndex([datetime.today()], name='a') - dateindex_name_b = pd.DatetimeIndex([datetime.today()], name='b') - dateindex_name_None = pd.DatetimeIndex([datetime.today()], name=None) - - python_array = [1,2,3] - python_array_transposed = [[1],[2],[3]] - numpy_array = np.array([1,2,3]) - - #index append index naming behavior - self.assertEqual(idx_name_a.append(idx_name_b).name, None) - self.assertEqual(idx_name_a.append(idx2_name_a).name, 'a') - - #index append array - self.assertEqual(idx_name_a.append(python_array_transposed).name, 'a') - self.assertEqual(idx_name_a.append(numpy_array.T).name, 'a') - - #index append index different dtype - self.assertEqual(idx_name_a.append(stridx_name_a).name, 'a') - self.assertEqual(idx_name_a.append(stridx_name_stra).name, None) - - #index append index with no name - self.assertEqual(idx_name_a.append(idx_name_none).name, 'a') - - #index append dateindex - self.assertEqual(idx_name_a.append(dateindex_name_a).name, 'a') - self.assertEqual(idx_name_a.append(dateindex_name_b).name, None) - 
self.assertEqual(idx_name_a.append(dateindex_name_None).name, 'a') - - #dateindex append - self.assertEqual(dateindex_name_a.append(python_array_transposed).name, 'a') - self.assertEqual(dateindex_name_a.append(numpy_array.T).name, 'a') - - self.assertEqual(dateindex_name_a.append(idx_name_none).name, 'a') - self.assertEqual(dateindex_name_a.append(dateindex_name_b).name, None) - self.assertEqual(dateindex_name_a.append(dateindex_name_None).name, 'a') - self.assertEqual(dateindex_name_a.append(idx_name_a).name, 'a') + self.compare_naming([1,2,3],[4,5,6],[2,5,6], func_name='append') + self.compare_naming(['1','2','3'],['4','5','6'],['2','5','6'], func_name='append') + self.compare_naming([1,'2',3],[4,'5','6'],[2,3,6], func_name='append') + self.compare_naming([1.,2.,3.],[4.,5.,6.],[2.,3.,6.], func_name='append') + + def compare_naming(self, equal_values, disjunct_values, intersect_values, func_name='union'): + ''' + given arrays of values checks whether the function specified + keeps the naming convention + + euqual_values : values to be used for equal comparison of func_name + disjunct_values : values to be used for disjunct comparison of func_name + intersect_values : values to be used for intersect comparison of func_name + + func_name : either union, append, intersection + ''' + idx_name_a = self.create_index(content=equal_values, name='a') + idx_name_a_function = getattr(idx_name_a, func_name) + + equal_idx_name_a = self.create_index(content=equal_values, name='a') + equal_idx_name_b = self.create_index(content=equal_values, name='b') + equal_idx_name_none = self.create_index(content=equal_values, name=None) + equal_python_array = equal_values + equal_numpy_array = np.array(equal_values) + + empty_idx_name_a = self.create_index(content=[], name='a') + empty_idx_name_b = self.create_index(content=[], name='b') + empty_idx_name_none = self.create_index(content=[], name=None) + empty_python_array = [] + empty_numpy_array = np.array([]) + + disjunct_idx_name_a = 
self.create_index(content=disjunct_values, name='a') + disjunct_idx_name_b = self.create_index(content=disjunct_values, name='b') + disjunct_idx_name_none = self.create_index(content=disjunct_values, name=None) + disjunct_python_array = disjunct_values + disjunct_numpy_array = np.array(disjunct_values) + + intersect_idx_name_a = self.create_index(content=intersect_values, name='a') + intersect_idx_name_b = self.create_index(content=intersect_values, name='b') + intersect_idx_name_none = self.create_index(content=intersect_values, name=None) + intersect_python_array = intersect_values + intersect_numpy_array = np.array(intersect_values) + + #index union naming behavior in equal contents + self.assertEqual(idx_name_a_function(equal_idx_name_a).name, idx_name_a.name) + self.assertEqual(idx_name_a_function(equal_idx_name_b).name, None) + self.assertEqual(idx_name_a_function(equal_idx_name_none).name, idx_name_a.name) + #self.assertEqual(idx_name_a_function(equal_python_array).name, idx_name_a.name) + self.assertEqual(idx_name_a_function(equal_numpy_array).name, idx_name_a.name) + + #index union naming behavior in empty second index + self.assertEqual(idx_name_a_function(empty_idx_name_a).name, idx_name_a.name) + self.assertEqual(idx_name_a_function(empty_idx_name_b).name, None) + self.assertEqual(idx_name_a_function(empty_idx_name_none).name, idx_name_a.name) + #self.assertEqual(idx_name_a_function(empty_python_array).name, idx_name_a.name) + self.assertEqual(idx_name_a_function(empty_numpy_array).name, idx_name_a.name) + + #index union naming behavior with disjunct contents + self.assertEqual(idx_name_a_function(disjunct_idx_name_a).name, idx_name_a.name) + self.assertEqual(idx_name_a_function(disjunct_idx_name_b).name, None) + self.assertEqual(idx_name_a_function(disjunct_idx_name_none).name, idx_name_a.name) + #self.assertEqual(idx_name_a_function(disjunct_python_array).name, idx_name_a.name) + self.assertEqual(idx_name_a_function(disjunct_numpy_array).name, 
idx_name_a.name) + + #index union naming behavior with intersecting content + self.assertEqual(idx_name_a_function(intersect_idx_name_a).name, idx_name_a.name) + self.assertEqual(idx_name_a_function(intersect_idx_name_b).name, None) + self.assertEqual(idx_name_a_function(intersect_idx_name_none).name, idx_name_a.name) + #self.assertEqual(idx_name_a_function(intersect_python_array).name, idx_name_a.name) + self.assertEqual(idx_name_a_function(intersect_numpy_array).name, idx_name_a.name) def test_intersection_preserves_name(self): #GH 9943 - df = pd.DataFrame([np.nan, np.nan], columns = ['tags'], index=pd.Int64Index([4815961, 4815962], dtype='int64', name='id')) - self.assertEqual(str(df), ' tags\nid \n4815961 NaN\n4815962 NaN') + df = pd.DataFrame([np.nan, np.nan], + columns = ['tags'], + index=pd.Int64Index([4815961, 4815962], + dtype='int64', name='id')) + self.assertEqual(str(df), + ' tags\nid \n4815961 NaN\n4815962 NaN') L = [4815962] self.assertEqual(list(L), list(df.index.intersection(L))) - self.assertEqual( df.ix[L].tags.index.name, df.ix[df.index.intersection(L)].tags.index.name) - assert_frame_equal(df.ix[L], df.ix[df.index.intersection(L)]) + self.assertEqual( df.ix[L].tags.index.name, + df.ix[df.index.intersection(L)].tags.index.name) + self.assertEqual( df.ix[L].index.name, + df.ix[df.index.intersection(L)].index.name) def test_intersection(self): first = self.strIndex[:20] From 4dc1a4d7b1bb6e79b09a5d270e88eba22c8e8fef Mon Sep 17 00:00:00 2001 From: Henning Sperr Date: Sun, 17 May 2015 12:54:20 +0900 Subject: [PATCH 3/3] FIX: more shortpaths, refactor tests --- pandas/core/common.py | 2 +- pandas/core/index.py | 79 +++++++---------- pandas/tests/test_index.py | 173 ++++++++++++++----------------------- pandas/tseries/index.py | 32 ++++--- 4 files changed, 118 insertions(+), 168 deletions(-) diff --git a/pandas/core/common.py b/pandas/core/common.py index 3c92300d1f9a5..af9d9b63d8b3f 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ 
-3323,7 +3323,7 @@ def save(obj, path): # TODO remove in 0.13 def _maybe_match_name(a, b): a_name = getattr(a, 'name', None) - b_name = getattr(b, 'name', None) + b_name = getattr(b, 'name', a_name) if a_name == b_name: return a_name return None diff --git a/pandas/core/index.py b/pandas/core/index.py index 7f95ac3af1dc5..fab0880e5beb1 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -21,7 +21,7 @@ from pandas.core.common import (isnull, array_equivalent, is_dtype_equal, is_object_dtype, _values_from_object, is_float, is_integer, is_iterator, is_categorical_dtype, ABCSeries, ABCCategorical, _ensure_object, _ensure_int64, is_bool_indexer, - is_list_like, is_bool_dtype, is_null_slice, is_integer_dtype) + is_list_like, is_bool_dtype, is_null_slice, is_integer_dtype, _maybe_match_name) from pandas.core.config import get_option from pandas.io.common import PerformanceWarning @@ -1088,7 +1088,6 @@ def _ensure_compat_append(self, other): ------- list of to_concat, name of result Index """ - name = self.name to_concat = [self] if isinstance(other, (list, tuple)): @@ -1097,16 +1096,13 @@ def _ensure_compat_append(self, other): to_concat.append(other) for obj in to_concat: - if (isinstance(obj, Index) and - obj.name != name and - obj.name is not None): - name = None - break + result_name = _maybe_match_name(self, obj) + self.name = result_name to_concat = self._ensure_compat_concat(to_concat) to_concat = [x.values if isinstance(x, Index) else x for x in to_concat] - return to_concat, name + return to_concat, result_name def append(self, other): """ @@ -1367,27 +1363,23 @@ def union(self, other): if not hasattr(other, '__iter__'): raise TypeError('Input must be iterable.') + result_name = _maybe_match_name(self, other) + if len(other) == 0 or self.equals(other): - if (len(other) and - hasattr(other, 'name') and not - other.name == self.name and not - other.name is None): - self.name = None + self.name = result_name return self - other = _ensure_index(other) - + 
other = _ensure_index(other, copy=True) if len(self) == 0: - if not other.name == self.name: - other.name = None + other.name = result_name return other self._assert_can_do_setop(other) - if not is_dtype_equal(self.dtype,other.dtype): + #FIXME: right now crashes if we union with python array + if not is_dtype_equal(self.dtype, other.dtype): this = self.astype('O') other = other.astype('O') - return this.union(other) if self.is_monotonic and other.is_monotonic: @@ -1430,13 +1422,9 @@ def union(self, other): "incomparable objects" % e, RuntimeWarning) # for subclasses - return self._wrap_union_result(other, result) - - def _wrap_union_result(self, other, result): - name = None - if self.name == other.name or other.name is None: - name = self.name + return self._wrap_union_result(other, result, result_name) + def _wrap_union_result(self, other, result, name=None): return self.__class__(data=result, name=name) def intersection(self, other): @@ -1457,25 +1445,24 @@ def intersection(self, other): self._assert_can_do_setop(other) + result_name = _maybe_match_name(self, other) + other = _ensure_index(other) if self.equals(other): - if (hasattr(other, 'name') - and not other.name is None - and not other.name == self.name): - self.name = None + self.name = result_name return self if not is_dtype_equal(self.dtype,other.dtype): this = self.astype('O') other = other.astype('O') + other.name = result_name return this.intersection(other) if self.is_monotonic and other.is_monotonic: try: result = self._inner_indexer(self.values, other.values)[0] - - return self._wrap_union_result(other, result) + return self._wrap_union_result(other, result, result_name) except TypeError: pass @@ -1488,8 +1475,7 @@ def intersection(self, other): indexer = indexer[indexer != -1] taken = self.take(indexer) - if self.name != other.name and not other.name is None: - taken.name = None + taken.name = result_name return taken @@ -1515,14 +1501,12 @@ def difference(self, other): if not hasattr(other, 
'__iter__'): raise TypeError('Input must be iterable!') + result_name = _maybe_match_name(self, other) if self.equals(other): - return Index([], name=self.name) + return Index([], name=result_name) if not isinstance(other, Index): other = np.asarray(other) - result_name = self.name - else: - result_name = self.name if self.name == other.name else None theDiff = sorted(set(self) - set(other)) return Index(theDiff, name=result_name) @@ -1567,9 +1551,11 @@ def sym_diff(self, other, result_name=None): if not hasattr(other, '__iter__'): raise TypeError('Input must be iterable!') + if result_name is None: + result_name = _maybe_match_name(self, other) + if not isinstance(other, Index): other = Index(other) - result_name = result_name or self.name the_diff = sorted(set((self.difference(other)).union(other.difference(self)))) return Index(the_diff, name=result_name) @@ -2880,6 +2866,7 @@ def _create_from_codes(self, codes, categories=None, ordered=None, name=None): ordered = self.ordered if name is None: name = self.name + cat = Categorical.from_codes(codes, categories=categories, ordered=self.ordered) return CategoricalIndex(cat, name=name) @@ -3260,7 +3247,10 @@ def append(self, other): to_concat, name = self._ensure_compat_append(other) to_concat = [ self._is_dtype_compat(c) for c in to_concat ] codes = np.concatenate([ c.codes for c in to_concat ]) - return self._create_from_codes(codes, name=name) + new_index = self._create_from_codes(codes, name=name) + #if name should be set to None the create_from_codes method overrides that + new_index.name = name + return new_index @classmethod def _add_comparison_methods(cls): @@ -4420,7 +4410,6 @@ def from_arrays(cls, arrays, sortorder=None, names=None): of iterables """ from pandas.core.categorical import Categorical - if len(arrays) == 1: name = None if names is None else names[0] return Index(arrays[0], name=name) @@ -4430,7 +4419,6 @@ def from_arrays(cls, arrays, sortorder=None, names=None): labels = [c.codes for c in cats] 
if names is None: names = [c.name for c in cats] - return MultiIndex(levels=levels, labels=labels, sortorder=sortorder, names=names, verify_integrity=False) @@ -5480,13 +5468,12 @@ def union(self, other): """ self._assert_can_do_setop(other) + result_names = self.names if hasattr(other,'names') and self.names == other.names else [] + if len(other) == 0 or self.equals(other): + self.names = result_names return self - result_names = None - if self.names == other.names or other.names is None: - result_names = self.names - uniq_tuples = lib.fast_unique_multiple([self.values, other.values]) return MultiIndex.from_arrays(lzip(*uniq_tuples), sortorder=0, names=result_names) @@ -5509,7 +5496,7 @@ def intersection(self, other): return self result_names = None - if self.names == other.names or other.name is None: + if self.names == other.names or other.names is None: result_names = self.names self_tuples = self.values diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index 25489af84f3fa..14d2c26b7bfea 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -87,6 +87,32 @@ def test_logical_compat(self): 'cannot perform any', lambda : idx.any()) + def compare_naming(self, index_a_values, index_b_values, func_name='union'): + """ + given arrays of values checks whether the function specified + keeps the naming convention + + func_name : either union, append, intersection + """ + idx_name_a = self.create_index(content=index_a_values, name='a') + idx_name_a_function = getattr(idx_name_a, func_name) + + equal_idx_name_a = self.create_index(content=index_b_values, name='a') + equal_idx_name_b = self.create_index(content=index_b_values, name='b') + equal_idx_name_none = self.create_index(content=index_b_values, name=None) + equal_numpy_array = np.array(index_b_values) + + equal_python_array = index_b_values + if func_name == 'append': + equal_python_array = np.array(np.matrix(equal_python_array).T).tolist() + + #index union naming behavior in 
equal contents + self.assertEqual(idx_name_a_function(equal_idx_name_a).name, idx_name_a.name) + self.assertTrue(idx_name_a_function(equal_idx_name_b).name is None) + self.assertTrue(idx_name_a_function(equal_idx_name_none).name is None) + self.assertEqual(idx_name_a_function(equal_python_array).name, idx_name_a.name) + self.assertEqual(idx_name_a_function(equal_numpy_array).name, idx_name_a.name) + def test_boolean_context_compat(self): # boolean context compat @@ -272,7 +298,9 @@ def setUp(self): ) self.setup_indices() - def create_index(self, content=list('abcde'), name=None): + def create_index(self, content=None, name=None): + if content is None: + content=list('abcde') return Index(content, name=name) def test_new_axis(self): @@ -314,6 +342,14 @@ def test_constructor_corner(self): # corner case self.assertRaises(TypeError, Index, 0) + def test_name_is_preserved_on_operations(self): + #9965 + for func_name in ('append', 'intersection', 'union'): + self.compare_naming([1,2,3],[1,2,3], func_name=func_name) + self.compare_naming([1,2,3],[3,4,5], func_name=func_name) + self.compare_naming([1,2,3],[5,6,7], func_name=func_name) + self.compare_naming([1,2,3],[], func_name=func_name) + def test_constructor_from_series(self): expected = DatetimeIndex([Timestamp('20110101'),Timestamp('20120101'),Timestamp('20130101')]) @@ -616,107 +652,18 @@ def test_shift(self): shifted.name = 'shifted' self.assertEqual(shifted.name, shifted.shift(1, 'D').name) - def test_union_naming_behavior(self): - #9965 - self.compare_naming([1,2,3],[4,5,6],[2,5,6], func_name='union') - self.compare_naming(['1','2','3'],['4','5','6'],['2','5','6'], func_name='union') - self.compare_naming([1,'2',3],[4,'5','6'],[2,3,6], func_name='union') - self.compare_naming([1.,2.,3.],[4.,5.,6.],[2.,3.,6.], func_name='union') - - def test_intersection_naming_behavior(self): - #9965 - self.compare_naming([1,2,3],[4,5,6],[2,5,6], func_name='intersection') - 
self.compare_naming(['1','2','3'],['4','5','6'],['2','5','6'], func_name='intersection') - self.compare_naming([1,'2',3],[4,'5','6'],[2,3,6], func_name='intersection') - self.compare_naming([1.,2.,3.],[4.,5.,6.],[2.,3.,6.], func_name='intersection') - - def test_append_naming_behavior(self): - #9965 - self.compare_naming([1,2,3],[4,5,6],[2,5,6], func_name='append') - self.compare_naming(['1','2','3'],['4','5','6'],['2','5','6'], func_name='append') - self.compare_naming([1,'2',3],[4,'5','6'],[2,3,6], func_name='append') - self.compare_naming([1.,2.,3.],[4.,5.,6.],[2.,3.,6.], func_name='append') - - def compare_naming(self, equal_values, disjunct_values, intersect_values, func_name='union'): - ''' - given arrays of values checks whether the function specified - keeps the naming convention - - euqual_values : values to be used for equal comparison of func_name - disjunct_values : values to be used for disjunct comparison of func_name - intersect_values : values to be used for intersect comparison of func_name - - func_name : either union, append, intersection - ''' - idx_name_a = self.create_index(content=equal_values, name='a') - idx_name_a_function = getattr(idx_name_a, func_name) - - equal_idx_name_a = self.create_index(content=equal_values, name='a') - equal_idx_name_b = self.create_index(content=equal_values, name='b') - equal_idx_name_none = self.create_index(content=equal_values, name=None) - equal_python_array = equal_values - equal_numpy_array = np.array(equal_values) - - empty_idx_name_a = self.create_index(content=[], name='a') - empty_idx_name_b = self.create_index(content=[], name='b') - empty_idx_name_none = self.create_index(content=[], name=None) - empty_python_array = [] - empty_numpy_array = np.array([]) - - disjunct_idx_name_a = self.create_index(content=disjunct_values, name='a') - disjunct_idx_name_b = self.create_index(content=disjunct_values, name='b') - disjunct_idx_name_none = self.create_index(content=disjunct_values, name=None) - 
disjunct_python_array = disjunct_values - disjunct_numpy_array = np.array(disjunct_values) - - intersect_idx_name_a = self.create_index(content=intersect_values, name='a') - intersect_idx_name_b = self.create_index(content=intersect_values, name='b') - intersect_idx_name_none = self.create_index(content=intersect_values, name=None) - intersect_python_array = intersect_values - intersect_numpy_array = np.array(intersect_values) - - #index union naming behavior in equal contents - self.assertEqual(idx_name_a_function(equal_idx_name_a).name, idx_name_a.name) - self.assertEqual(idx_name_a_function(equal_idx_name_b).name, None) - self.assertEqual(idx_name_a_function(equal_idx_name_none).name, idx_name_a.name) - #self.assertEqual(idx_name_a_function(equal_python_array).name, idx_name_a.name) - self.assertEqual(idx_name_a_function(equal_numpy_array).name, idx_name_a.name) - - #index union naming behavior in empty second index - self.assertEqual(idx_name_a_function(empty_idx_name_a).name, idx_name_a.name) - self.assertEqual(idx_name_a_function(empty_idx_name_b).name, None) - self.assertEqual(idx_name_a_function(empty_idx_name_none).name, idx_name_a.name) - #self.assertEqual(idx_name_a_function(empty_python_array).name, idx_name_a.name) - self.assertEqual(idx_name_a_function(empty_numpy_array).name, idx_name_a.name) - - #index union naming behavior with disjunct contents - self.assertEqual(idx_name_a_function(disjunct_idx_name_a).name, idx_name_a.name) - self.assertEqual(idx_name_a_function(disjunct_idx_name_b).name, None) - self.assertEqual(idx_name_a_function(disjunct_idx_name_none).name, idx_name_a.name) - #self.assertEqual(idx_name_a_function(disjunct_python_array).name, idx_name_a.name) - self.assertEqual(idx_name_a_function(disjunct_numpy_array).name, idx_name_a.name) - - #index union naming behavior with intersecting content - self.assertEqual(idx_name_a_function(intersect_idx_name_a).name, idx_name_a.name) - 
self.assertEqual(idx_name_a_function(intersect_idx_name_b).name, None) - self.assertEqual(idx_name_a_function(intersect_idx_name_none).name, idx_name_a.name) - #self.assertEqual(idx_name_a_function(intersect_python_array).name, idx_name_a.name) - self.assertEqual(idx_name_a_function(intersect_numpy_array).name, idx_name_a.name) - def test_intersection_preserves_name(self): #GH 9943 - df = pd.DataFrame([np.nan, np.nan], - columns = ['tags'], - index=pd.Int64Index([4815961, 4815962], - dtype='int64', name='id')) - self.assertEqual(str(df), - ' tags\nid \n4815961 NaN\n4815962 NaN') + df = pd.DataFrame([np.nan, np.nan], columns=['tags'], + index=pd.Int64Index([4815961, 4815962], dtype='int64', name='id')) + self.assertEqual(str(df), ' tags\nid \n4815961 NaN\n4815962 NaN') L = [4815962] - self.assertEqual(list(L), list(df.index.intersection(L))) - self.assertEqual( df.ix[L].tags.index.name, - df.ix[df.index.intersection(L)].tags.index.name) - self.assertEqual( df.ix[L].index.name, - df.ix[df.index.intersection(L)].index.name) + intersection = df.index.intersection(L) + self.assertEqual(list(L), list(intersection)) + ixdfl = df.ix[L]; + ixdfinter = df.ix[intersection] + self.assertEqual(ixdfl.tags.index.name, ixdfinter.tags.index.name) + self.assertEqual(ixdfl.index.name, ixdfinter.index.name) def test_intersection(self): first = self.strIndex[:20] @@ -781,6 +728,7 @@ def test_union(self): self.assertIs(union, first) union = Index([]).union(first) + #FIXME: Union asserts that empty union returns the second index, this is not documented bug? feature? 
self.assertIs(union, first) # non-iterable input @@ -1536,10 +1484,14 @@ def setUp(self): self.indices = dict(catIndex = tm.makeCategoricalIndex(100)) self.setup_indices() - def create_index(self, categories=None, ordered=False): + def create_index(self, content=None, categories=None, ordered=False, name=None): if categories is None: categories = list('cab') - return CategoricalIndex(list('aabbca'), categories=categories, ordered=ordered) + + if content is None: + content = list('aabbca') + + return CategoricalIndex(content, categories=categories, ordered=ordered, name=name) def test_construction(self): @@ -1974,8 +1926,10 @@ def setUp(self): float = Float64Index(np.arange(5) * 2.5)) self.setup_indices() - def create_index(self): - return Float64Index(np.arange(5,dtype='float64')) + def create_index(self, content=None, name=None): + if content is None: + content = np.arange(5, dtype='float64') + return Float64Index(content, name=name) def test_repr_roundtrip(self): for ind in (self.mixed, self.float): @@ -2145,8 +2099,10 @@ def setUp(self): self.indices = dict(index = Int64Index(np.arange(0, 20, 2))) self.setup_indices() - def create_index(self): - return Int64Index(np.arange(5,dtype='int64')) + def create_index(self, content=None, name=None): + if content is None: + content =np.arange(5,dtype='int64') + return Int64Index(content, name=name) def test_too_many_names(self): def testit(): @@ -2635,8 +2591,10 @@ def setUp(self): self.indices = dict(index = tm.makeDateIndex(10)) self.setup_indices() - def create_index(self): - return date_range('20130101',periods=5) + def create_index(self, content=None, name=None): + if content is None: + content = '20130101' + return date_range(content, periods=5, name=name) def test_pickle_compat_construction(self): pass @@ -4693,6 +4651,7 @@ def test_index_name_retained(self): 'y': [2, 2, 8], 'z': [-5, 0, 5]}) result = result.set_index('z') + #FIXME: loc seems to use append which resets the index name result.loc[10] = [9, 10] 
df_expected = pd.DataFrame({'x': [1, 2, 6, 9], 'y': [2, 2, 8, 10], diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index e8ae0a75721ce..571bcc48b336f 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -5,7 +5,7 @@ import numpy as np from pandas.core.common import (_NS_DTYPE, _INT64_DTYPE, _values_from_object, _maybe_box, - ABCSeries, is_integer, is_float) + ABCSeries, is_integer, is_float, _maybe_match_name) from pandas.core.index import Index, Int64Index, Float64Index import pandas.compat as compat from pandas.compat import u @@ -809,7 +809,9 @@ def union(self, other): this, other = self._maybe_utc_convert(other) if this._can_fast_union(other): - return this._fast_union(other) + result = this._fast_union(other) + result.name = _maybe_match_name(self, other) + return result else: result = Index.union(this, other) if isinstance(result, DatetimeIndex): @@ -825,6 +827,8 @@ def union_many(self, others): this = self for other in others: + result_name = _maybe_match_name(this, other) + if not isinstance(this, DatetimeIndex): this = Index.union(this, other) continue @@ -845,6 +849,8 @@ def union_many(self, others): if isinstance(this, DatetimeIndex): this.tz = tz + this.name = result_name + if this.freq is None: this.offset = to_offset(this.inferred_freq) return this @@ -861,7 +867,6 @@ def append(self, other): ------- appended : Index """ - name = self.name to_concat = [self] if isinstance(other, (list, tuple)): @@ -869,17 +874,12 @@ def append(self, other): else: to_concat.append(other) - for obj in to_concat: - if (isinstance(obj, Index) and - obj.name != name and - obj.name is not None): - - name = None - break - to_concat = self._ensure_compat_concat(to_concat) - to_concat, factory = _process_concat_data(to_concat, name) + for element in to_concat: + result_name = _maybe_match_name(self, element) + + to_concat, factory = _process_concat_data(to_concat, result_name) return factory(to_concat) def join(self, other, how='left', 
level=None, return_indexers=False): @@ -1059,9 +1059,12 @@ def intersection(self, other): result.offset = to_offset(result.inferred_freq) return result + result_name = _maybe_match_name(self, other) if len(self) == 0: + self.name = result_name return self if len(other) == 0: + #FIXME: `other` is returned as-is, so result_name is never applied to it; should we return a renamed copy instead? return other # to make our life easier, "sort" the two ranges if self[0] <= other[0]: @@ -1073,11 +1076,12 @@ def intersection(self, other): start = right[0] if end < start: - return type(self)(data=[]) + return type(self)(data=[], name=result_name) else: lslice = slice(*left.slice_locs(start, end)) left_chunk = left.values[lslice] - return self._shallow_copy(left_chunk) + result = self._shallow_copy(left_chunk, name=result_name) + return result def _parsed_string_to_bounds(self, reso, parsed): """