From 0aeee8d057233630d39a00fa6b1d38d71a96e334 Mon Sep 17 00:00:00 2001
From: Stephen Kappel
Date: Sun, 8 May 2016 19:19:30 -0400
Subject: [PATCH 01/96] ENH: inplace dtype changes, df per-column dtype
 changes; GH7271

---
 doc/source/whatsnew/v0.18.2.txt    |  4 +-
 pandas/core/frame.py               | 41 +++++++++++++++++
 pandas/core/generic.py             | 16 +++++--
 pandas/tests/frame/test_dtypes.py  | 70 ++++++++++++++++++++++++++++++
 pandas/tests/series/test_dtypes.py | 13 ++++++
 5 files changed, 138 insertions(+), 6 deletions(-)

diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt
index fa426aa30bc65..d3d0ed4ba86d1 100644
--- a/doc/source/whatsnew/v0.18.2.txt
+++ b/doc/source/whatsnew/v0.18.2.txt
@@ -30,8 +30,8 @@ Other enhancements
 ^^^^^^^^^^^^^^^^^^
 
 - The ``.tz_localize()`` method of ``DatetimeIndex`` and ``Timestamp`` has gained the ``errors`` keyword, so you can potentially coerce nonexistent timestamps to ``NaT``. The default behaviour remains to raising a ``NonExistentTimeError`` (:issue:`13057`)
-
-
+- The `copy` argument to the ``astype()`` functions has been deprecated in favor of a new ``inplace`` argument. (:issue:`12086`)
+- ``astype()`` will now accept a dict of column name to data types mapping as the ``dtype`` argument. (:issue:`12086`)
 
 .. _whatsnew_0182.api:

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index b209b6d6ec543..3e55f6f731ca8 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -3772,6 +3772,47 @@ def update(self, other, join='left', overwrite=True, filter_func=None,
     # ----------------------------------------------------------------------
     # Misc methods
 
+    def astype(self, dtype, copy=True, inplace=False, raise_on_error=True,
+               **kwargs):
+        """
+        Cast object to given data type(s).
+
+        Parameters
+        ----------
+        dtype : numpy.dtype or Python type (to cast entire DataFrame to the
+            same type). Alternatively, {col: dtype, ...}, where col is a column
+            label and dtype is a numpy.dtype or Python type (to cast one or
+            more of the DataFrame's columns to column-specific types).
+        copy : deprecated; use inplace instead
+        inplace : boolean, default False
+            Modify the DataFrame in place (do not create a new object)
+        raise_on_error : raise on invalid input
+        kwargs : keyword arguments to pass on to the constructor if
+            inplace=False
+
+        Returns
+        -------
+        casted : type of caller
+        """
+        if isinstance(dtype, collections.Mapping):
+            if inplace:
+                for col, typ in dtype.items():
+                    self[col].astype(typ, inplace=True,
+                                     raise_on_error=raise_on_error)
+                return None
+            else:
+                from pandas.tools.merge import concat
+                casted_cols = [self[col].astype(typ, copy=copy)
+                               for col, typ in dtype.items()]
+                other_col_labels = self.columns.difference(dtype.keys())
+                other_cols = [self[col].copy() if copy else self[col]
+                              for col in other_col_labels]
+                new_df = concat(casted_cols + other_cols, axis=1)
+                return new_df.reindex(columns=self.columns, copy=False)
+        df = super(DataFrame, self)
+        return df.astype(dtype=dtype, copy=copy, inplace=inplace,
+                         raise_on_error=raise_on_error, **kwargs)
+
     def first_valid_index(self):
         """
         Return label for first non-NA/null value

diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 6c80ab9d87e33..79c80735ac65e 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -143,7 +143,7 @@ def _init_mgr(self, mgr, axes=None, dtype=None, copy=False):
 
     @property
     def _constructor(self):
-        """Used when a manipulation result has the same dimesions as the
+        """Used when a manipulation result has the same dimensions as the
         original.
         """
         raise AbstractMethodError(self)
@@ -2930,14 +2930,17 @@ def blocks(self):
         """Internal property, property synonym for as_blocks()"""
         return self.as_blocks()
 
-    def astype(self, dtype, copy=True, raise_on_error=True, **kwargs):
+    def astype(self, dtype, copy=True, inplace=False, raise_on_error=True,
+               **kwargs):
         """
         Cast object to input numpy.dtype
-        Return a copy when copy = True (be really careful with this!)
 
         Parameters
         ----------
         dtype : numpy.dtype or Python type
+        copy : deprecated; use inplace instead
+        inplace : boolean, default False
+            Modify the NDFrame in place (do not create a new object)
         raise_on_error : raise on invalid input
         kwargs : keyword arguments to pass on to the constructor
 
@@ -2945,7 +2948,12 @@ def astype(self, dtype, copy=True, raise_on_error=True, **kwargs):
         -------
         casted : type of caller
         """
-
+        if inplace:
+            new_data = self._data.astype(dtype=dtype, copy=False,
+                                         raise_on_error=raise_on_error,
+                                         **kwargs)
+            self._update_inplace(new_data)
+            return
         mgr = self._data.astype(dtype=dtype, copy=copy,
                                 raise_on_error=raise_on_error, **kwargs)
         return self._constructor(mgr).__finalize__(self)

diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py
index 97ca8238b78f9..22cb6890a3439 100644
--- a/pandas/tests/frame/test_dtypes.py
+++ b/pandas/tests/frame/test_dtypes.py
@@ -372,6 +372,76 @@ def test_astype_str(self):
         expected = DataFrame(['1.12345678901'])
         assert_frame_equal(result, expected)
 
+    def test_astype_dict(self):
+        # GH7271
+        a = Series(date_range('2010-01-04', periods=5))
+        b = Series(range(5))
+        c = Series([0.0, 0.2, 0.4, 0.6, 0.8])
+        d = Series(['1.0', '2', '3.14', '4', '5.4'])
+        df = DataFrame({'a': a, 'b': b, 'c': c, 'd': d})
+        original = df.copy(deep=True)
+
+        # change type of a subset of columns
+        expected = DataFrame({
+            'a': a,
+            'b': Series(['0', '1', '2', '3', '4']),
+            'c': c,
+            'd': Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype='float32')})
+        astyped = df.astype({'b': 'str', 'd': 'float32'})
+        assert_frame_equal(astyped, expected)
+        assert_frame_equal(df, original)
+        self.assertEqual(astyped.b.dtype, 'object')
+        self.assertEqual(astyped.d.dtype, 'float32')
+
+        # change all columns
+        assert_frame_equal(df.astype({'a': str, 'b': str, 'c': str, 'd': str}),
+                           df.astype(str))
+        assert_frame_equal(df, original)
+
+        # error should be raised when using something other than column labels
+        # in the keys of the dtype dict
+        self.assertRaises(KeyError, df.astype, {'b': str, 2: str})
+        self.assertRaises(KeyError, df.astype, {'e': str})
+        assert_frame_equal(df, original)
+
+        # if the dtypes provided are the same as the original dtypes, the
+        # resulting DataFrame should be the same as the original DataFrame
+        equiv = df.astype({col: df[col].dtype for col in df.columns})
+        assert_frame_equal(df, equiv)
+        assert_frame_equal(df, original)
+
+        # using inplace=True, the df should be changed
+        output = df.astype({'b': 'str', 'd': 'float32'}, inplace=True)
+        self.assertEqual(output, None)
+        assert_frame_equal(df, expected)
+        df.astype({'b': np.float32, 'c': 'float32', 'd': np.float32},
+                  inplace=True)
+        self.assertEqual(df.a.dtype, original.a.dtype)
+        self.assertEqual(df.b.dtype, 'float32')
+        self.assertEqual(df.c.dtype, 'float32')
+        self.assertEqual(df.d.dtype, 'float32')
+        self.assertEqual(df.b[0], 0.0)
+        df.astype({'b': str, 'c': 'float64', 'd': np.float64}, inplace=True)
+        self.assertEqual(df.a.dtype, original.a.dtype)
+        self.assertEqual(df.b.dtype, 'object')
+        self.assertEqual(df.c.dtype, 'float64')
+        self.assertEqual(df.d.dtype, 'float64')
+        self.assertEqual(df.b[0], '0.0')
+
+    def test_astype_inplace(self):
+        # GH7271
+        df = DataFrame({'a': range(10),
+                        'b': range(2, 12),
+                        'c': np.arange(4.0, 14.0, dtype='float64')})
+        df.astype('float', inplace=True)
+        for col in df.columns:
+            self.assertTrue(df[col].map(lambda x: type(x) == float).all())
+            self.assertEqual(df[col].dtype, 'float64')
+        df.astype('str', inplace=True)
+        for col in df.columns:
+            self.assertTrue(df[col].map(lambda x: type(x) == str).all())
+            self.assertEqual(df[col].dtype, 'object')
+
     def test_timedeltas(self):
         df = DataFrame(dict(A=Series(date_range('2012-1-1', periods=3,
                                                 freq='D')),

diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py
index fc963d4597246..d90a08ec5518d 100644
--- a/pandas/tests/series/test_dtypes.py
+++ b/pandas/tests/series/test_dtypes.py
@@ -133,6 +133,19 @@ def test_astype_unicode(self):
             reload(sys)  # noqa
             sys.setdefaultencoding(former_encoding)
 
+    def test_astype_inplace(self):
+        s = Series(np.random.randn(5), name='foo')
+
+        for dtype in ['float32', 'float64', 'int64', 'int32']:
+            astyped = s.astype(dtype, inplace=False)
+            self.assertEqual(astyped.dtype, dtype)
+            self.assertEqual(astyped.name, s.name)
+
+        for dtype in ['float32', 'float64', 'int64', 'int32']:
+            s.astype(dtype, inplace=True)
+            self.assertEqual(s.dtype, dtype)
+            self.assertEqual(s.name, 'foo')
+
     def test_complexx(self):
         # GH4819
         # complex access for ndarray compat

From 58dd71b38af5ef142522c3f2bc6074967e177862 Mon Sep 17 00:00:00 2001
From: Stephen Kappel
Date: Mon, 9 May 2016 22:25:14 -0400
Subject: [PATCH 02/96] ENH: NDFrame astype() now accepts inplace arg and
 dtype arg can be a mapping of col to type; GH7271

---
 pandas/core/frame.py               | 41 ------------------------
 pandas/core/generic.py             | 49 ++++++++++++++++++++++++------
 pandas/tests/frame/test_dtypes.py  | 31 +++++++++----------
 pandas/tests/series/test_dtypes.py | 26 +++++++++++++---
 4 files changed, 75 insertions(+), 72 deletions(-)

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 3e55f6f731ca8..b209b6d6ec543 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -3772,47 +3772,6 @@ def update(self, other, join='left', overwrite=True, filter_func=None,
     # ----------------------------------------------------------------------
     # Misc methods
 
-    def astype(self, dtype, copy=True, inplace=False, raise_on_error=True,
-               **kwargs):
-        """
-        Cast object to given data type(s).
-
-        Parameters
-        ----------
-        dtype : numpy.dtype or Python type (to cast entire DataFrame to the
-            same type). Alternatively, {col: dtype, ...}, where col is a column
-            label and dtype is a numpy.dtype or Python type (to cast one or
-            more of the DataFrame's columns to column-specific types).
-        copy : deprecated; use inplace instead
-        inplace : boolean, default False
-            Modify the DataFrame in place (do not create a new object)
-        raise_on_error : raise on invalid input
-        kwargs : keyword arguments to pass on to the constructor if
-            inplace=False
-
-        Returns
-        -------
-        casted : type of caller
-        """
-        if isinstance(dtype, collections.Mapping):
-            if inplace:
-                for col, typ in dtype.items():
-                    self[col].astype(typ, inplace=True,
-                                     raise_on_error=raise_on_error)
-                return None
-            else:
-                from pandas.tools.merge import concat
-                casted_cols = [self[col].astype(typ, copy=copy)
-                               for col, typ in dtype.items()]
-                other_col_labels = self.columns.difference(dtype.keys())
-                other_cols = [self[col].copy() if copy else self[col]
-                              for col in other_col_labels]
-                new_df = concat(casted_cols + other_cols, axis=1)
-                return new_df.reindex(columns=self.columns, copy=False)
-        df = super(DataFrame, self)
-        return df.astype(dtype=dtype, copy=copy, inplace=inplace,
-                         raise_on_error=raise_on_error, **kwargs)
-
     def first_valid_index(self):
         """
         Return label for first non-NA/null value

diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 79c80735ac65e..ca090634e524f 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -1,4 +1,5 @@
 # pylint: disable=W0231,E1101
+import collections
 import warnings
 import operator
 import weakref
@@ -2937,26 +2938,54 @@ def astype(self, dtype, copy=True, inplace=False, raise_on_error=True,
 
         Parameters
         ----------
-        dtype : numpy.dtype or Python type
+        dtype : numpy.dtype or Python type (to cast entire DataFrame to the
+            same type). Alternatively, {col: dtype, ...}, where col is a column
+            label and dtype is a numpy.dtype or Python type (to cast one or
+            more of the DataFrame's columns to column-specific types).
         copy : deprecated; use inplace instead
         inplace : boolean, default False
             Modify the NDFrame in place (do not create a new object)
         raise_on_error : raise on invalid input
-        kwargs : keyword arguments to pass on to the constructor
+        kwargs : keyword arguments to pass on to the constructor if
+            inplace=False
 
         Returns
         -------
-        casted : type of caller
-        """
+        casted : type of caller (if inplace=False) or None (if inplace=True)
+        """
+        if isinstance(dtype, collections.Mapping):
+            if self.ndim == 1:  # i.e. Series
+                if len(dtype) > 1 or list(dtype.keys())[0] != self.name:
+                    if raise_on_error:
+                        raise KeyError('Only the Series name can be used for '
+                                       'the key in Series dtype mappings.')
+                    return
+                for key, value in dtype.items():
+                    return self.astype(value, copy, inplace, raise_on_error,
+                                       **kwargs)
+
+            if inplace:
+                for col, typ in dtype.items():
+                    self[col].astype(typ, inplace=True,
+                                     raise_on_error=raise_on_error)
+                return
+            from pandas.tools.merge import concat
+            casted_cols = [self[col].astype(typ, copy=copy)
+                           for col, typ in dtype.items()]
+            other_col_labels = self.columns.difference(dtype.keys())
+            other_cols = [self[col].copy() if copy else self[col]
+                          for col in other_col_labels]
+            new_df = concat(casted_cols + other_cols, axis=1)
+            return new_df.reindex(columns=self.columns, copy=False)
+
+        # else, only a single dtype is given
+        new_data = self._data.astype(dtype=dtype, copy=not inplace,
+                                     raise_on_error=raise_on_error, **kwargs)
         if inplace:
-            new_data = self._data.astype(dtype=dtype, copy=False,
-                                         raise_on_error=raise_on_error,
-                                         **kwargs)
             self._update_inplace(new_data)
             return
-        mgr = self._data.astype(dtype=dtype, copy=copy,
-                                raise_on_error=raise_on_error, **kwargs)
-        return self._constructor(mgr).__finalize__(self)
+        else:
+            return self._constructor(new_data).__finalize__(self)
 
     def copy(self, deep=True):
         """

diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py
index 22cb6890a3439..6c581298531fa 100644
--- a/pandas/tests/frame/test_dtypes.py
+++ b/pandas/tests/frame/test_dtypes.py
@@ -382,16 +382,14 @@ def test_astype_dict(self):
         original = df.copy(deep=True)
 
         # change type of a subset of columns
+        result = df.astype({'b': 'str', 'd': 'float32'})
         expected = DataFrame({
             'a': a,
             'b': Series(['0', '1', '2', '3', '4']),
             'c': c,
             'd': Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype='float32')})
-        astyped = df.astype({'b': 'str', 'd': 'float32'})
-        assert_frame_equal(astyped, expected)
+        assert_frame_equal(result, expected)
         assert_frame_equal(df, original)
-        self.assertEqual(astyped.b.dtype, 'object')
-        self.assertEqual(astyped.d.dtype, 'float32')
 
         # change all columns
         assert_frame_equal(df.astype({'a': str, 'b': str, 'c': str, 'd': str}),
@@ -412,21 +410,22 @@ def test_astype_dict(self):
 
         # using inplace=True, the df should be changed
         output = df.astype({'b': 'str', 'd': 'float32'}, inplace=True)
+        expected = DataFrame({
+            'a': a,
+            'b': Series(['0', '1', '2', '3', '4']),
+            'c': c,
+            'd': Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype='float32')})
         self.assertEqual(output, None)
         assert_frame_equal(df, expected)
-        df.astype({'b': np.float32, 'c': 'float32', 'd': np.float32},
+
+        df.astype({'b': np.float32, 'c': 'float32', 'd': np.float64},
                   inplace=True)
-        self.assertEqual(df.a.dtype, original.a.dtype)
-        self.assertEqual(df.b.dtype, 'float32')
-        self.assertEqual(df.c.dtype, 'float32')
-        self.assertEqual(df.d.dtype, 'float32')
-        self.assertEqual(df.b[0], 0.0)
-        df.astype({'b': str, 'c': 'float64', 'd': np.float64}, inplace=True)
-        self.assertEqual(df.a.dtype, original.a.dtype)
-        self.assertEqual(df.b.dtype, 'object')
-        self.assertEqual(df.c.dtype, 'float64')
-        self.assertEqual(df.d.dtype, 'float64')
-        self.assertEqual(df.b[0], '0.0')
+        expected = DataFrame({
+            'a': a,
+            'b': Series([0.0, 1.0, 2.0, 3.0, 4.0], dtype='float32'),
+            'c': Series([0.0, 0.2, 0.4, 0.6, 0.8], dtype='float32'),
+            'd': Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype='float64')})
+        assert_frame_equal(df, expected)
 
     def test_astype_inplace(self):
         # GH7271

diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py
index d90a08ec5518d..4d6f32af26e46 100644
--- a/pandas/tests/series/test_dtypes.py
+++ b/pandas/tests/series/test_dtypes.py
@@ -133,15 +133,31 @@ def test_astype_unicode(self):
             reload(sys)  # noqa
             sys.setdefaultencoding(former_encoding)
 
+    def test_astype_dict(self):
+        s = Series(range(0, 10, 2), name='abc')
+
+        result = s.astype({'abc': str})
+        expected = Series(['0', '2', '4', '6', '8'], name='abc')
+        assert_series_equal(result, expected)
+
+        result = s.astype({'abc': 'float64'})
+        expected = Series([0.0, 2.0, 4.0, 6.0, 8.0], dtype='float64',
+                          name='abc')
+        assert_series_equal(result, expected)
+
+        self.assertRaises(KeyError, s.astype, {'abc': str, 'def': str})
+        self.assertRaises(KeyError, s.astype, {0: str})
+
     def test_astype_inplace(self):
         s = Series(np.random.randn(5), name='foo')
+        dtypes = ['float32', 'float64', 'int64', 'int32']
 
-        for dtype in ['float32', 'float64', 'int64', 'int32']:
-            astyped = s.astype(dtype, inplace=False)
-            self.assertEqual(astyped.dtype, dtype)
-            self.assertEqual(astyped.name, s.name)
+        for dtype in dtypes:
+            result = s.astype(dtype, inplace=False)
+            self.assertEqual(result.dtype, dtype)
+            self.assertEqual(result.name, s.name)
 
-        for dtype in ['float32', 'float64', 'int64', 'int32']:
+        for dtype in dtypes:
             s.astype(dtype, inplace=True)
             self.assertEqual(s.dtype, dtype)
             self.assertEqual(s.name, 'foo')

From 43989fd7cb9917e885c2b55a172c4f9f3838d59d Mon Sep 17 00:00:00 2001
From: Jeff Reback
Date: Tue, 10 May 2016 10:53:37 -0400
Subject: [PATCH 03/96] DOC: xref #13112, add back lexsorting example

---
 doc/source/advanced.rst | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/doc/source/advanced.rst b/doc/source/advanced.rst
index 7c7895a95310d..e50e792201d26 100644
--- a/doc/source/advanced.rst
+++ b/doc/source/advanced.rst
@@ -528,6 +528,13 @@ return a copy of the data rather than a view:
    jim joe
    1   z    0.64094
 
+Furthermore if you try to index something that is not fully lexsorted, this can raise:
+
+.. code-block:: ipython
+
+   In [5]: dfm.loc[(0,'y'):(1, 'z')]
+   KeyError: 'Key length (2) was greater than MultiIndex lexsort depth (1)'
+
 The ``is_lexsorted()`` method on an ``Index`` show if the index is sorted, and the ``lexsort_depth`` property returns the sort depth:
 
 .. ipython:: python
@@ -542,6 +549,12 @@ The ``is_lexsorted()`` method on an ``Index`` show if the index is sorted, and t
    dfm.index.is_lexsorted()
    dfm.index.lexsort_depth
 
+And now selection works as expected.
+
+.. ipython:: python
+
+   dfm.loc[(0,'y'):(1, 'z')]
+
 Take Methods
 ------------

From f0e47a9c9350e0d8fc0fe00a1ca0237582437e9d Mon Sep 17 00:00:00 2001
From: Jeff Reback
Date: Wed, 11 May 2016 09:32:25 -0400
Subject: [PATCH 04/96] COMPAT: boto import issues

Author: Jeff Reback

Closes #13136 from jreback/boto and squashes the following commits:

dcb02d2 [Jeff Reback] COMPAT: boto import issues
---
 pandas/io/common.py          | 109 ++--------------------------------
 pandas/io/s3.py              | 112 +++++++++++++++++++++++++++++++++++
 pandas/io/tests/test_data.py |   3 -
 pandas/io/tests/test_s3.py   |  14 +++++
 4 files changed, 130 insertions(+), 108 deletions(-)
 create mode 100644 pandas/io/s3.py
 create mode 100644 pandas/io/tests/test_s3.py

diff --git a/pandas/io/common.py b/pandas/io/common.py
index dc7c483c1fb68..cf4bba6e97afb 100644
--- a/pandas/io/common.py
+++ b/pandas/io/common.py
@@ -104,85 +104,6 @@ def __next__(self):
 
 BaseIterator.next = lambda self: self.__next__()
 
-try:
-    from boto.s3 import key
-
-    class BotoFileLikeReader(key.Key):
-        """boto Key modified to be more file-like
-
-        This modification of the boto Key will read through a supplied
-        S3 key once, then stop. The unmodified boto Key object will repeatedly
-        cycle through a file in S3: after reaching the end of the file,
-        boto will close the file. Then the next call to `read` or `next` will
-        re-open the file and start reading from the beginning.
-
-        Also adds a `readline` function which will split the returned
-        values by the `\n` character.
-        """
-
-        def __init__(self, *args, **kwargs):
-            encoding = kwargs.pop("encoding", None)  # Python 2 compat
-            super(BotoFileLikeReader, self).__init__(*args, **kwargs)
-            # Add a flag to mark the end of the read.
-            self.finished_read = False
-            self.buffer = ""
-            self.lines = []
-            if encoding is None and compat.PY3:
-                encoding = "utf-8"
-            self.encoding = encoding
-            self.lines = []
-
-        def next(self):
-            return self.readline()
-
-        __next__ = next
-
-        def read(self, *args, **kwargs):
-            if self.finished_read:
-                return b'' if compat.PY3 else ''
-            return super(BotoFileLikeReader, self).read(*args, **kwargs)
-
-        def close(self, *args, **kwargs):
-            self.finished_read = True
-            return super(BotoFileLikeReader, self).close(*args, **kwargs)
-
-        def seekable(self):
-            """Needed for reading by bz2"""
-            return False
-
-        def readline(self):
-            """Split the contents of the Key by '\n' characters."""
-            if self.lines:
-                retval = self.lines[0]
-                self.lines = self.lines[1:]
-                return retval
-            if self.finished_read:
-                if self.buffer:
-                    retval, self.buffer = self.buffer, ""
-                    return retval
-                else:
-                    raise StopIteration
-
-            if self.encoding:
-                self.buffer = "{}{}".format(
-                    self.buffer, self.read(8192).decode(self.encoding))
-            else:
-                self.buffer = "{}{}".format(self.buffer, self.read(8192))
-
-            split_buffer = self.buffer.split("\n")
-            self.lines.extend(split_buffer[:-1])
-            self.buffer = split_buffer[-1]
-
-            return self.readline()
-except ImportError:
-    # boto is only needed for reading from S3.
-    pass
-except TypeError:
-    # boto/boto3 issues
-    # GH11915
-    pass
-
-
 def _is_url(url):
     """Check to see if a URL has a valid protocol.
@@ -319,32 +240,10 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
         return tuple(to_return)
 
     if _is_s3_url(filepath_or_buffer):
-        try:
-            import boto
-        except:
-            raise ImportError("boto is required to handle s3 files")
-        # Assuming AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY and AWS_S3_HOST
-        # are environment variables
-        parsed_url = parse_url(filepath_or_buffer)
-        s3_host = os.environ.get('AWS_S3_HOST', 's3.amazonaws.com')
-
-        try:
-            conn = boto.connect_s3(host=s3_host)
-        except boto.exception.NoAuthHandlerFound:
-            conn = boto.connect_s3(host=s3_host, anon=True)
-
-        b = conn.get_bucket(parsed_url.netloc, validate=False)
-        if compat.PY2 and (compression == 'gzip' or
-                           (compression == 'infer' and
-                            filepath_or_buffer.endswith(".gz"))):
-            k = boto.s3.key.Key(b, parsed_url.path)
-            filepath_or_buffer = BytesIO(k.get_contents_as_string(
-                encoding=encoding))
-        else:
-            k = BotoFileLikeReader(b, parsed_url.path, encoding=encoding)
-            k.open('r')  # Expose read errors immediately
-            filepath_or_buffer = k
-        return filepath_or_buffer, None, compression
+        from pandas.io.s3 import get_filepath_or_buffer
+        return get_filepath_or_buffer(filepath_or_buffer,
+                                      encoding=encoding,
+                                      compression=compression)
 
     # It is a pathlib.Path/py.path.local or string
     filepath_or_buffer = _stringify_path(filepath_or_buffer)

diff --git a/pandas/io/s3.py b/pandas/io/s3.py
new file mode 100644
index 0000000000000..df8f1d9187031
--- /dev/null
+++ b/pandas/io/s3.py
@@ -0,0 +1,112 @@
+""" s3 support for remote file interactivity """
+
+import os
+from pandas import compat
+from pandas.compat import BytesIO
+
+try:
+    import boto
+    from boto.s3 import key
+except:
+    raise ImportError("boto is required to handle s3 files")
+
+if compat.PY3:
+    from urllib.parse import urlparse as parse_url
+else:
+    from urlparse import urlparse as parse_url
+
+
+class BotoFileLikeReader(key.Key):
+    """boto Key modified to be more file-like
+
+    This modification of the boto Key will read through a supplied
+    S3 key once, then stop. The unmodified boto Key object will repeatedly
+    cycle through a file in S3: after reaching the end of the file,
+    boto will close the file. Then the next call to `read` or `next` will
+    re-open the file and start reading from the beginning.
+
+    Also adds a `readline` function which will split the returned
+    values by the `\n` character.
+    """
+
+    def __init__(self, *args, **kwargs):
+        encoding = kwargs.pop("encoding", None)  # Python 2 compat
+        super(BotoFileLikeReader, self).__init__(*args, **kwargs)
+        # Add a flag to mark the end of the read.
+        self.finished_read = False
+        self.buffer = ""
+        self.lines = []
+        if encoding is None and compat.PY3:
+            encoding = "utf-8"
+        self.encoding = encoding
+        self.lines = []
+
+    def next(self):
+        return self.readline()
+
+    __next__ = next
+
+    def read(self, *args, **kwargs):
+        if self.finished_read:
+            return b'' if compat.PY3 else ''
+        return super(BotoFileLikeReader, self).read(*args, **kwargs)
+
+    def close(self, *args, **kwargs):
+        self.finished_read = True
+        return super(BotoFileLikeReader, self).close(*args, **kwargs)
+
+    def seekable(self):
+        """Needed for reading by bz2"""
+        return False
+
+    def readline(self):
+        """Split the contents of the Key by '\n' characters."""
+        if self.lines:
+            retval = self.lines[0]
+            self.lines = self.lines[1:]
+            return retval
+        if self.finished_read:
+            if self.buffer:
+                retval, self.buffer = self.buffer, ""
+                return retval
+            else:
+                raise StopIteration
+
+        if self.encoding:
+            self.buffer = "{}{}".format(
+                self.buffer, self.read(8192).decode(self.encoding))
+        else:
+            self.buffer = "{}{}".format(self.buffer, self.read(8192))
+
+        split_buffer = self.buffer.split("\n")
+        self.lines.extend(split_buffer[:-1])
+        self.buffer = split_buffer[-1]
+
+        return self.readline()
+
+
+def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
+                           compression=None):
+
+    # Assuming AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY and AWS_S3_HOST
+    # are environment variables
+    parsed_url = parse_url(filepath_or_buffer)
+    s3_host = os.environ.get('AWS_S3_HOST', 's3.amazonaws.com')
+
+    try:
+        conn = boto.connect_s3(host=s3_host)
+    except boto.exception.NoAuthHandlerFound:
+        conn = boto.connect_s3(host=s3_host, anon=True)
+
+    b = conn.get_bucket(parsed_url.netloc, validate=False)
+    if compat.PY2 and (compression == 'gzip' or
+                       (compression == 'infer' and
+                        filepath_or_buffer.endswith(".gz"))):
+        k = boto.s3.key.Key(b, parsed_url.path)
+        filepath_or_buffer = BytesIO(k.get_contents_as_string(
+            encoding=encoding))
+    else:
+        k = BotoFileLikeReader(b, parsed_url.path, encoding=encoding)
+        k.open('r')  # Expose read errors immediately
+        filepath_or_buffer = k
+    return filepath_or_buffer, None, compression

diff --git a/pandas/io/tests/test_data.py b/pandas/io/tests/test_data.py
index d9c09fa788332..6845eb009df5d 100644
--- a/pandas/io/tests/test_data.py
+++ b/pandas/io/tests/test_data.py
@@ -472,9 +472,6 @@ def test_options_source_warning(self):
 
 
 class TestDataReader(tm.TestCase):
-    def test_is_s3_url(self):
-        from pandas.io.common import _is_s3_url
-        self.assertTrue(_is_s3_url("s3://pandas/somethingelse.com"))
 
     @network
     def test_read_yahoo(self):

diff --git a/pandas/io/tests/test_s3.py b/pandas/io/tests/test_s3.py
new file mode 100644
index 0000000000000..8058698a906ea
--- /dev/null
+++ b/pandas/io/tests/test_s3.py
@@ -0,0 +1,14 @@
+import nose
+from pandas.util import testing as tm
+
+from pandas.io.common import _is_s3_url
+
+
+class TestS3URL(tm.TestCase):
+    def test_is_s3_url(self):
+        self.assertTrue(_is_s3_url("s3://pandas/somethingelse.com"))
+        self.assertFalse(_is_s3_url("s4://pandas/somethingelse.com"))
+
+if __name__ == '__main__':
+    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
+                   exit=False)

From d0734ba4d0f4c228110dc3974943ce4ec2adeea4 Mon Sep 17 00:00:00 2001
From: Yadunandan
Date: Wed, 11 May 2016 18:13:30 -0400
Subject: [PATCH 05/96] BUG: Added checks for NaN in __call__ of EngFormatter

closes #11981

Author: Yadunandan

Closes #13124 from yaduart/bugfix-11981 and squashes the following commits:

8de1f64 [Yadunandan] BUG: Added checks for Nan in __call__ of EngFormatter
---
 doc/source/whatsnew/v0.18.2.txt     |  1 +
 pandas/formats/format.py            |  3 +++
 pandas/tests/formats/test_format.py | 19 +++++++++++++++++--
 3 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt
index fa426aa30bc65..5ffbce9867121 100644
--- a/doc/source/whatsnew/v0.18.2.txt
+++ b/doc/source/whatsnew/v0.18.2.txt
@@ -132,3 +132,4 @@ Bug Fixes
 
 - Bug in ``NaT`` - ``Period`` raises ``AttributeError`` (:issue:`13071`)
 - Bug in ``Period`` addition raises ``TypeError`` if ``Period`` is on right hand side (:issue:`13069`)
+- Bug in ``pd.set_eng_float_format()`` that would prevent NaN's from formatting (:issue:`11981`)

diff --git a/pandas/formats/format.py b/pandas/formats/format.py
index c3ffc018d1031..70b506a1415c1 100644
--- a/pandas/formats/format.py
+++ b/pandas/formats/format.py
@@ -2590,6 +2590,9 @@ def __call__(self, num):
         import math
 
         dnum = decimal.Decimal(str(num))
+        if decimal.Decimal.is_nan(dnum):
+            return 'NaN'
+
         sign = 1
 
         if dnum < 0:  # pragma: no cover

diff --git a/pandas/tests/formats/test_format.py b/pandas/tests/formats/test_format.py
index 4fcee32c46067..96770a86ff383 100644
--- a/pandas/tests/formats/test_format.py
+++ b/pandas/tests/formats/test_format.py
@@ -3087,11 +3087,11 @@ def test_to_csv_doublequote(self):
 
     def test_to_csv_escapechar(self):
         df = DataFrame({'col': ['a"a', '"bb"']})
-        expected = """\
+        expected = '''\
 "","col"
 "0","a\\"a"
 "1","\\"bb\\""
-"""
+'''
 
         with tm.ensure_clean('test.csv') as path:  # QUOTE_ALL
             df.to_csv(path, quoting=1, doublequote=False, escapechar='\\')
@@ -3925,6 +3925,21 @@ def test_rounding(self):
         result = formatter(0)
         self.assertEqual(result, u(' 0.000'))
 
+    def test_nan(self):
+        # Issue #11981
+
+        formatter = fmt.EngFormatter(accuracy=1, use_eng_prefix=True)
+        result = formatter(np.nan)
+        self.assertEqual(result, u('NaN'))
+
+        df = pd.DataFrame({'a':[1.5, 10.3, 20.5],
+                           'b':[50.3, 60.67, 70.12],
+                           'c':[100.2, 101.33, 120.33]})
+        pt = df.pivot_table(values='a', index='b', columns='c')
+        fmt.set_eng_float_format(accuracy=1)
+        result = pt.to_string()
+        self.assertTrue('NaN' in result)
+        self.reset_display_options()
 
 def _three_digit_exp():
     return '%.4g' % 1.7e8 == '1.7e+008'

From 2a99394bf96415a5b525e6db206a04d3d2ff68c3 Mon Sep 17 00:00:00 2001
From: sinhrks
Date: Wed, 11 May 2016 18:41:01 -0400
Subject: [PATCH 06/96] TST: fix assert_categorical_equal message

stage 1 of #13076

Author: sinhrks

Closes #13080 from sinhrks/test_categorical_message and squashes the following commits:

81172ce [sinhrks] TST: fix assert_categorical_equal message
---
 pandas/core/categorical.py                  |  2 +-
 pandas/tests/series/test_datetime_values.py |  2 -
 pandas/tests/test_testing.py                | 72 ++++++++++++++++-----
 pandas/util/testing.py                      | 19 +++---
 4 files changed, 65 insertions(+), 30 deletions(-)

diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
index 4f80c610c1126..44c91862227d8 100644
--- a/pandas/core/categorical.py
+++ b/pandas/core/categorical.py
@@ -985,7 +985,7 @@ def __setstate__(self, state):
 
         # Provide compatibility with pre-0.15.0 Categoricals.
         if '_codes' not in state and 'labels' in state:
-            state['_codes'] = state.pop('labels')
+            state['_codes'] = state.pop('labels').astype(np.int8)
         if '_categories' not in state and '_levels' in state:
             state['_categories'] = self._validate_categories(state.pop(
                 '_levels'))

diff --git a/pandas/tests/series/test_datetime_values.py b/pandas/tests/series/test_datetime_values.py
index 5b12baf6c6fc5..6e82f81f901a9 100644
--- a/pandas/tests/series/test_datetime_values.py
+++ b/pandas/tests/series/test_datetime_values.py
@@ -320,8 +320,6 @@ def test_strftime(self):
             expected = np.array(['2015/03/01', '2015/03/02', '2015/03/03',
                                  '2015/03/04', '2015/03/05'], dtype=np.object_)
             # dtype may be S10 or U10 depending on python version
-            print(result)
-            print(expected)
             self.assert_numpy_array_equal(result, expected, check_dtype=False)
 
         period_index = period_range('20150301', periods=5)

diff --git a/pandas/tests/test_testing.py b/pandas/tests/test_testing.py
index 9294bccce013f..357d53cb58c72 100644
--- a/pandas/tests/test_testing.py
+++ b/pandas/tests/test_testing.py
@@ -65,9 +65,8 @@ def test_assert_almost_equal_dicts(self):
         self._assert_almost_equal_both({'a': 1, 'b': 2}, {'a': 1, 'b': 2})
 
         self._assert_not_almost_equal_both({'a': 1, 'b': 2}, {'a': 1, 'b': 3})
-        self._assert_not_almost_equal_both(
-            {'a': 1, 'b': 2}, {'a': 1, 'b': 2, 'c': 3}
-        )
+        self._assert_not_almost_equal_both({'a': 1, 'b': 2},
+                                           {'a': 1, 'b': 2, 'c': 3})
         self._assert_not_almost_equal_both({'a': 1}, 1)
         self._assert_not_almost_equal_both({'a': 1}, 'abc')
         self._assert_not_almost_equal_both({'a': 1}, [1, ])
@@ -215,11 +214,11 @@ def test_numpy_array_equal_message(self):
 \\[right\\]: \\[1\\.0, nan, 3\\.0\\]"""
 
         with assertRaisesRegexp(AssertionError, expected):
-            assert_numpy_array_equal(
-                np.array([np.nan, 2, 3]), np.array([1, np.nan, 3]))
+            assert_numpy_array_equal(np.array([np.nan, 2, 3]),
+                                     np.array([1, np.nan, 3]))
         with assertRaisesRegexp(AssertionError, expected):
-            assert_almost_equal(
-                np.array([np.nan, 2, 3]), np.array([1, np.nan, 3]))
+            assert_almost_equal(np.array([np.nan, 2, 3]),
+                                np.array([1, np.nan, 3]))
 
         expected = """numpy array are different
 
@@ -339,8 +338,8 @@ def test_index_equal_message(self):
            labels=\\[\\[0, 0, 1, 1\\], \\[0, 1, 2, 3\\]\\]\\)"""
 
         idx1 = pd.Index([1, 2, 3])
-        idx2 = pd.MultiIndex.from_tuples([('A', 1), ('A', 2), ('B', 3), ('B', 4
-                                                                          )])
+        idx2 = pd.MultiIndex.from_tuples([('A', 1), ('A', 2),
+                                          ('B', 3), ('B', 4)])
         with assertRaisesRegexp(AssertionError, expected):
             assert_index_equal(idx1, idx2, exact=False)
 
@@ -350,10 +349,10 @@ def test_index_equal_message(self):
 \\[left\\]: Int64Index\\(\\[2, 2, 3, 4\\], dtype='int64'\\)
 \\[right\\]: Int64Index\\(\\[1, 2, 3, 4\\], dtype='int64'\\)"""
 
-        idx1 = pd.MultiIndex.from_tuples([('A', 2), ('A', 2), ('B', 3), ('B', 4
-                                                                          )])
-        idx2 = pd.MultiIndex.from_tuples([('A', 1), ('A', 2), ('B', 3), ('B', 4
-                                                                          )])
+        idx1 = pd.MultiIndex.from_tuples([('A', 2), ('A', 2),
+                                          ('B', 3), ('B', 4)])
+        idx2 = pd.MultiIndex.from_tuples([('A', 1), ('A', 2),
+                                          ('B', 3), ('B', 4)])
         with assertRaisesRegexp(AssertionError, expected):
             assert_index_equal(idx1, idx2)
         with assertRaisesRegexp(AssertionError, expected):
@@ -434,10 +433,10 @@ def test_index_equal_message(self):
 \\[left\\]: Int64Index\\(\\[2, 2, 3, 4\\], dtype='int64'\\)
 \\[right\\]: Int64Index\\(\\[1, 2, 3, 4\\], dtype='int64'\\)"""
 
-        idx1 = pd.MultiIndex.from_tuples([('A', 2), ('A', 2), ('B', 3), ('B', 4
-                                                                          )])
-        idx2 = pd.MultiIndex.from_tuples([('A', 1), ('A', 2), ('B', 3), ('B', 4
-                                                                          )])
+        idx1 = pd.MultiIndex.from_tuples([('A', 2), ('A', 2),
+                                          ('B', 3), ('B', 4)])
+        idx2 = pd.MultiIndex.from_tuples([('A', 1), ('A', 2),
+                                          ('B', 3), ('B', 4)])
         with assertRaisesRegexp(AssertionError, expected):
             assert_index_equal(idx1, idx2)
         with assertRaisesRegexp(AssertionError, expected):
@@ -674,6 +673,45 @@ def test_notisinstance(self):
         tm.assertNotIsInstance(pd.Series([1]), pd.Series)
 
 
+class TestAssertCategoricalEqual(unittest.TestCase):
+    _multiprocess_can_split_ = True
+
+    def test_categorical_equal_message(self):
+
+        expected = """Categorical\\.categories are different
+
+Categorical\\.categories values are different \\(25\\.0 %\\)
+\\[left\\]: Int64Index\\(\\[1, 2, 3, 4\\], dtype='int64'\\)
+\\[right\\]: Int64Index\\(\\[1, 2, 3, 5\\], dtype='int64'\\)"""
+
+        a = pd.Categorical([1, 2, 3, 4])
+        b = pd.Categorical([1, 2, 3, 5])
+        with assertRaisesRegexp(AssertionError, expected):
+            tm.assert_categorical_equal(a, b)
+
+        expected = """Categorical\\.codes are different
+
+Categorical\\.codes values are different \\(50\\.0 %\\)
+\\[left\\]: \\[0, 1, 3, 2\\]
+\\[right\\]: \\[0, 1, 2, 3\\]"""
+
+        a = pd.Categorical([1, 2, 4, 3], categories=[1, 2, 3, 4])
+        b = pd.Categorical([1, 2, 3, 4], categories=[1, 2, 3, 4])
+        with assertRaisesRegexp(AssertionError, expected):
+            tm.assert_categorical_equal(a, b)
+
+        expected = """Categorical are different
+
+Attribute "ordered" are different
+\\[left\\]: False
+\\[right\\]: True"""
+
+        a = pd.Categorical([1, 2, 3, 4], ordered=False)
+        b = pd.Categorical([1, 2, 3, 4], ordered=True)
+        with assertRaisesRegexp(AssertionError, expected):
+            tm.assert_categorical_equal(a, b)
+
+
 class TestRNGContext(unittest.TestCase):
 
     def test_RNGContext(self):

diff --git a/pandas/util/testing.py b/pandas/util/testing.py
index 3ea4a09c453ee..8682302b542be 100644
--- a/pandas/util/testing.py
+++ b/pandas/util/testing.py
@@ -903,18 +903,17 @@ def assertNotIsInstance(obj, cls, msg=''):
         raise AssertionError(err_msg.format(msg, cls))
 
-def assert_categorical_equal(res, exp):
-    assertIsInstance(res, pd.Categorical, '[Categorical] ')
-    assertIsInstance(exp, pd.Categorical, '[Categorical] ')
+def assert_categorical_equal(left, right, check_dtype=True,
+                             obj='Categorical'):
+    assertIsInstance(left, pd.Categorical, '[Categorical] ')
+    assertIsInstance(right, pd.Categorical, '[Categorical] ')
 
-    assert_index_equal(res.categories, exp.categories)
+    assert_index_equal(left.categories, right.categories,
+                       obj='{0}.categories'.format(obj))
+    assert_numpy_array_equal(left.codes, right.codes, check_dtype=check_dtype,
+                             obj='{0}.codes'.format(obj))
 
-    if not array_equivalent(res.codes, exp.codes):
-        raise AssertionError(
-            'codes not equivalent: {0} vs {1}.'.format(res.codes, exp.codes))
-
-    if res.ordered != exp.ordered:
-        raise AssertionError("ordered not the same")
+    assert_attr_equal('ordered', left, right, obj=obj)
 
 def raise_assert_detail(obj, message, left, right):

From 4aa6323e7d72fe00417d8aab783a5f78cf497018 Mon Sep 17 00:00:00 2001
From: sinhrks
Date: Tue, 3 May 2016 16:30:40 +0900
Subject: [PATCH 07/96] BUG: Series ops with object dtype may incorrectly fail

closes #13043
closes #13072
---
 doc/source/whatsnew/v0.18.2.txt         | 14 ++++++
 pandas/core/ops.py                      | 24 ++++++++--
 pandas/tseries/tests/test_period.py     | 62 +++++++++++++++++++++++++
 pandas/tseries/tests/test_timedeltas.py | 32 +++++++++++++
 4 files changed, 128 insertions(+), 4 deletions(-)

diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt
index 5ffbce9867121..34bd2956319fc 100644
--- a/doc/source/whatsnew/v0.18.2.txt
+++ b/doc/source/whatsnew/v0.18.2.txt
@@ -128,8 +128,22 @@ Bug Fixes
 
+- Bug in ``Series`` arithmetic raises ``TypeError`` if it contains datetime-like as ``object`` dtype (:issue:`13043`)
+
 - Bug in ``NaT`` - ``Period`` raises ``AttributeError`` (:issue:`13071`)
 - Bug in ``Period`` addition raises ``TypeError`` if ``Period`` is on right hand side (:issue:`13069`)
 - Bug in ``pd.set_eng_float_format()`` that would prevent NaN's from formatting (:issue:`11981`)
+
+
+
+
+
+
+
+
+
+
+
+

diff --git a/pandas/core/ops.py b/pandas/core/ops.py
index 63fea71895da2..b02f94cc92e22 100644
--- a/pandas/core/ops.py
+++ b/pandas/core/ops.py
@@ -19,6 +19,7 @@
 from pandas.tslib import iNaT
 from pandas.compat import bind_method
 import pandas.core.missing as missing
+import pandas.algos as _algos
 import pandas.core.algorithms as algos
 from pandas.core.common import (is_list_like, notnull, isnull,
                                 _values_from_object, _maybe_match_name,
@@ -600,6 +601,21 @@ def na_op(x, y):
             result = missing.fill_zeros(result, x, y, name, fill_zeros)
         return result
 
+    def safe_na_op(lvalues, rvalues):
+        try:
+            return na_op(lvalues, rvalues)
+        except Exception:
+            if isinstance(rvalues, ABCSeries):
+                if is_object_dtype(rvalues):
+                    # if dtype is object, try elementwise op
+                    return _algos.arrmap_object(rvalues,
+                                                lambda x: op(lvalues, x))
+            else:
+                if is_object_dtype(lvalues):
+                    return _algos.arrmap_object(lvalues,
+                                                lambda x: op(x, rvalues))
+            raise
+
     def wrapper(left, right, name=name, na_op=na_op):
 
         if isinstance(right, pd.DataFrame):
@@ -638,9 +654,8 @@ def wrapper(left, right, name=name, na_op=na_op):
             if ridx is not None:
                 rvalues = algos.take_1d(rvalues, ridx)
 
-            arr = na_op(lvalues, rvalues)
-
-            return left._constructor(wrap_results(arr), index=index,
+            result = wrap_results(safe_na_op(lvalues, rvalues))
+            return left._constructor(result, index=index,
                                      name=name, dtype=dtype)
         else:
             # scalars
@@ -648,7 +663,8 @@ def wrapper(left, right, name=name, na_op=na_op):
                     not isinstance(lvalues, pd.DatetimeIndex)):
                 lvalues = lvalues.values
 
-            return left._constructor(wrap_results(na_op(lvalues, rvalues)),
+            result = wrap_results(safe_na_op(lvalues, rvalues))
+            return left._constructor(result,
                                      index=left.index, name=left.name,
                                      dtype=dtype)

diff --git a/pandas/tseries/tests/test_period.py b/pandas/tseries/tests/test_period.py
index 740a158c52f87..4217cc9a299a3 100644
--- a/pandas/tseries/tests/test_period.py
+++ b/pandas/tseries/tests/test_period.py
@@ -4151,6 +4151,68 @@ def test_intercept_astype_object(self):
         result = df.values.squeeze()
         self.assertTrue((result[:, 0] == expected.values).all())
 
+    def test_ops_series_timedelta(self):
+        # GH 13043
+        s = pd.Series([pd.Period('2015-01-01', freq='D'),
+                       pd.Period('2015-01-02', freq='D')], name='xxx')
+        self.assertEqual(s.dtype, object)
+
+        exp = pd.Series([pd.Period('2015-01-02', freq='D'),
+                         pd.Period('2015-01-03', freq='D')], name='xxx')
+        tm.assert_series_equal(s + pd.Timedelta('1 days'), exp)
+        tm.assert_series_equal(pd.Timedelta('1 days') + s, exp)
+
+        tm.assert_series_equal(s + pd.tseries.offsets.Day(), exp)
+        tm.assert_series_equal(pd.tseries.offsets.Day() + s, exp)
+
+    def test_ops_series_period(self):
+        # GH 13043
+        s = pd.Series([pd.Period('2015-01-01', freq='D'),
+                       pd.Period('2015-01-02', freq='D')], name='xxx')
+        self.assertEqual(s.dtype, object)
+
+        p = pd.Period('2015-01-10', freq='D')
+        # dtype will be object because of original dtype
+        exp = pd.Series([9, 8], name='xxx', dtype=object)
+        tm.assert_series_equal(p - s, exp)
+        tm.assert_series_equal(s - p, -exp)
+
+        s2 = pd.Series([pd.Period('2015-01-05', freq='D'),
+                        pd.Period('2015-01-04', freq='D')], name='xxx')
+        self.assertEqual(s2.dtype, object)
+
+        exp = pd.Series([4, 2], name='xxx', dtype=object)
+        tm.assert_series_equal(s2 - s, exp)
+        tm.assert_series_equal(s - s2, -exp)
+
+    def test_ops_frame_period(self):
+        # GH 13043
+        df = pd.DataFrame({'A': [pd.Period('2015-01', freq='M'),
+                                 pd.Period('2015-02', freq='M')],
+                           'B': [pd.Period('2014-01', freq='M'),
+                                 pd.Period('2014-02', freq='M')]})
+        self.assertEqual(df['A'].dtype, object)
+        self.assertEqual(df['B'].dtype, object)
+
+        p = pd.Period('2015-03', freq='M')
+        # dtype will be object because of original dtype
+        exp = pd.DataFrame({'A': np.array([2, 1], dtype=object),
+                            'B': np.array([14, 13], dtype=object)})
+        tm.assert_frame_equal(p - df, exp)
+        tm.assert_frame_equal(df - p, -exp)
+
+        df2 = pd.DataFrame({'A': [pd.Period('2015-05', freq='M'),
+                                  pd.Period('2015-06', freq='M')],
+                            'B': [pd.Period('2015-05', freq='M'),
+                                  pd.Period('2015-06', freq='M')]})
+        self.assertEqual(df2['A'].dtype, object)
+        self.assertEqual(df2['B'].dtype, object)
+
+        exp = pd.DataFrame({'A': np.array([4, 4], dtype=object),
+                            'B': np.array([16, 16], dtype=object)})
+        tm.assert_frame_equal(df2 - df, exp)
+        tm.assert_frame_equal(df - df2, -exp)
+
 
 if __name__ == '__main__':
     import nose

diff --git a/pandas/tseries/tests/test_timedeltas.py b/pandas/tseries/tests/test_timedeltas.py
index c764f34b697c1..8474bbbc91931 100644
--- a/pandas/tseries/tests/test_timedeltas.py
+++ b/pandas/tseries/tests/test_timedeltas.py
@@ -413,6 +413,38 @@ def test_ops_series(self):
             tm.assert_series_equal(expected, td * other)
             tm.assert_series_equal(expected, other * td)
 
+    def test_ops_series_object(self):
+        # GH 13043
+        s = pd.Series([pd.Timestamp('2015-01-01', tz='US/Eastern'),
+                       pd.Timestamp('2015-01-01', tz='Asia/Tokyo')],
+                      name='xxx')
+        self.assertEqual(s.dtype, object)
+
+        exp = pd.Series([pd.Timestamp('2015-01-02', tz='US/Eastern'),
+                         pd.Timestamp('2015-01-02', tz='Asia/Tokyo')],
+                        name='xxx')
+        tm.assert_series_equal(s + pd.Timedelta('1 days'), exp)
+        tm.assert_series_equal(pd.Timedelta('1 days') + s, exp)
+
+        # object series & object series
+        s2 = pd.Series([pd.Timestamp('2015-01-03', tz='US/Eastern'),
+                        pd.Timestamp('2015-01-05', tz='Asia/Tokyo')],
+                       name='xxx')
+        self.assertEqual(s2.dtype, object)
+        exp = pd.Series([pd.Timedelta('2 days'), pd.Timedelta('4 days')],
+                        name='xxx')
+        tm.assert_series_equal(s2 - s, exp)
+        tm.assert_series_equal(s - s2, -exp)
+
+        s = pd.Series([pd.Timedelta('01:00:00'), pd.Timedelta('02:00:00')],
+                      name='xxx', dtype=object)
+        self.assertEqual(s.dtype, object)
+
+        exp = pd.Series([pd.Timedelta('01:30:00'), pd.Timedelta('02:30:00')],
+                        name='xxx')
+        tm.assert_series_equal(s + pd.Timedelta('00:30:00'), exp)
+        tm.assert_series_equal(pd.Timedelta('00:30:00') + s, exp)
+
     def test_compare_timedelta_series(self):
         # regresssion test for GH5963
         s = pd.Series([timedelta(days=1), timedelta(days=2)])

From 4de83d25d751d8ca102867b2d46a5547c01d7248 Mon Sep 17 00:00:00 2001
From: Jeff Reback
Date: Thu, 12 May 2016 09:08:51 -0400
Subject: [PATCH 08/96] PERF: quantile now operates per block boosting perf /
 fix quantile with nan

closes #11623
closes #13098

Author: Jeff Reback

Closes #13122 from jreback/quantile and squashes the following commits:

aad72cb [Jeff Reback] PERF: quantile now operates per block boosting perf
REGR: series quantile with nan
---
 asv_bench/benchmarks/frame_methods.py |  13 +-
 codecov.yml                           |   3 +-
 doc/source/whatsnew/v0.18.1.txt       |   1 -
 doc/source/whatsnew/v0.18.2.txt       |  16 +-
 pandas/core/frame.py                  |  34 ++--
 pandas/core/internals.py              | 273 ++++++++++++++++++++++----
 pandas/core/series.py                 |  21 +-
 pandas/io/pytables.py                 |  10 +-
 pandas/src/inference.pyx              |  27 ++-
 pandas/tests/frame/test_quantile.py   |  55 ++++--
 pandas/tests/series/test_quantile.py  |   8 +
 pandas/tests/test_groupby.py          |   6 +-
 12 files changed, 352 insertions(+), 115 deletions(-)

diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py
index 9367c42f8d39a..5c5a1df4ea1f8 100644
--- a/asv_bench/benchmarks/frame_methods.py
+++ b/asv_bench/benchmarks/frame_methods.py
@@ -423,7 +423,7 @@ class frame_get_dtype_counts(object):
     goal_time = 0.2
 
     def setup(self):
-        self.df = pandas.DataFrame(np.random.randn(10, 10000))
+        self.df = DataFrame(np.random.randn(10, 10000))
 
     def time_frame_get_dtype_counts(self):
         self.df.get_dtype_counts()
@@ -985,3 +985,14 @@ def setup(self):
 
     def time_series_string_vector_slice(self):
         self.s.str[:5]
+
+
+class frame_quantile_axis1(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.df = DataFrame(np.random.randn(1000, 3),
+                            columns=list('ABC'))
+
+    def time_frame_quantile_axis1(self):
+        self.df.quantile([0.1, 0.5], axis=1)

diff --git a/codecov.yml b/codecov.yml
index edf2d821e07e5..86e7dd55c9550 100644
--- a/codecov.yml
+++ b/codecov.yml
@@ -9,4 +9,5 @@ coverage:
       branches: null
     changes:
       default:
-        branches: null
+        branches:
+          - master

diff --git a/doc/source/whatsnew/v0.18.1.txt b/doc/source/whatsnew/v0.18.1.txt
index 7f837bef5251c..51982c42499ff 100644
--- a/doc/source/whatsnew/v0.18.1.txt
+++ b/doc/source/whatsnew/v0.18.1.txt
@@ -563,7 +563,6 @@ Performance Improvements
 - Improved speed of SAS reader (:issue:`12656`, :issue:`12961`)
 - Performance improvements in ``.groupby(..).cumcount()`` (:issue:`11039`)
 - Improved memory usage in ``pd.read_csv()`` when using ``skiprows=an_integer`` (:issue:`13005`)
-
 - Improved performance of ``DataFrame.to_sql`` when checking case sensitivity for tables. Now only checks if table has been created correctly when table name is not lower case. (:issue:`12876`)
 - Improved performance of ``Period`` construction and time series plotting (:issue:`12903`, :issue:`11831`).
 - Improved performance of ``.str.encode()`` and ``.str.decode()`` methods (:issue:`13008`)

diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt
index 34bd2956319fc..85209c0dfa03d 100644
--- a/doc/source/whatsnew/v0.18.2.txt
+++ b/doc/source/whatsnew/v0.18.2.txt
@@ -97,6 +97,9 @@ Performance Improvements
 
 - Improved performance of sparse ``IntIndex.intersect`` (:issue:`13082`)
 - Improved performance of sparse arithmetic with ``BlockIndex`` when the number of blocks are large, though recommended to use ``IntIndex`` in such cases (:issue:`13082`)
+- increased performance of ``DataFrame.quantile()`` as it now operates per-block (:issue:`11623`)
+
+
 
 
 
@@ -110,6 +113,7 @@ Bug Fixes
 
+- Regression in ``Series.quantile`` with nans (also shows up in ``.median()`` and ``.describe()``); furthermore now names the ``Series`` with the quantile (:issue:`13098`, :issue:`13146`)
 
 
@@ -135,15 +139,3 @@ Bug Fixes
 
 - Bug in ``NaT`` - ``Period`` raises ``AttributeError`` (:issue:`13071`)
 - Bug in ``Period`` addition raises ``TypeError`` if ``Period`` is on right hand side (:issue:`13069`)
 - Bug in ``pd.set_eng_float_format()`` that would prevent NaN's from formatting (:issue:`11981`)
-
-
-
-
-
-
-
-
-
-
-
-

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index b209b6d6ec543..3bf442349ef04 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -4989,31 +4989,27 @@ def quantile(self, q=0.5, axis=0, numeric_only=True,
           0.5  2.5  55.0
         """
         self._check_percentile(q)
-        if not com.is_list_like(q):
-            q = [q]
-            squeeze = True
-        else:
-            squeeze = False
 
         data = self._get_numeric_data() if numeric_only else self
         axis = self._get_axis_number(axis)
+        is_transposed = axis == 1
 
-        def _quantile(series):
-            res = series.quantile(q, interpolation=interpolation)
-            return series.name, res
-
-        if axis == 1:
+        if is_transposed:
             data = data.T
 
-        # unable to use DataFrame.apply, becasuse data may be empty
-        result = dict(_quantile(s) for (_, s) in data.iteritems())
-        result = self._constructor(result, columns=data.columns)
-        if squeeze:
-            if result.shape == (1, 1):
-                result = result.T.iloc[:, 0]  # don't want scalar
-            else:
-                result = result.T.squeeze()
-            result.name = None  # For groupby, so it can set an index name
+        result = data._data.quantile(qs=q,
+                                     axis=1,
+                                     interpolation=interpolation,
+                                     transposed=is_transposed)
+
+        if result.ndim == 2:
+            result = self._constructor(result)
+        else:
+            result = self._constructor_sliced(result, name=q)
+
+        if is_transposed:
+            result = result.T
+
         return result
 
     def to_timestamp(self, freq=None, how='start', axis=0, copy=True):

diff --git a/pandas/core/internals.py b/pandas/core/internals.py
index abfc5c989056e..97df81ad6be48 100644
--- a/pandas/core/internals.py
+++ b/pandas/core/internals.py
@@ -40,7 +40,7 @@
 from pandas.util.decorators import cache_readonly
 
 from pandas.tslib import Timedelta
-from pandas import compat
+from pandas import compat, _np_version_under1p9
 from pandas.compat import range, map, zip, u
 
 from pandas.lib import BlockPlacement
@@ -84,7 +84,7 @@ def __init__(self, values, placement, ndim=None, fastpath=False):
             self.mgr_locs = placement
             self.values = values
 
-            if len(self.mgr_locs) != len(self.values):
+            if ndim and len(self.mgr_locs) != len(self.values):
                 raise ValueError('Wrong number of items passed %d, placement '
                                  'implies %d' % (len(self.values),
                                                  len(self.mgr_locs)))
@@ -180,6 +180,12 @@ def make_block(self, values, placement=None, ndim=None, **kwargs):
 
         return make_block(values, placement=placement, ndim=ndim, **kwargs)
 
+    def make_block_scalar(self, values, **kwargs):
+        """
+        Create a ScalarBlock
+        """
+        return ScalarBlock(values)
+
     def make_block_same_class(self, values, placement=None, fastpath=True,
                               **kwargs):
         """ Wrap given values in a block of same type as self. """
@@ -324,7 +330,8 @@ def apply(self, func, mgr=None, **kwargs):
         """
         result = func(self.values, **kwargs)
         if not isinstance(result, Block):
-            result = self.make_block(values=_block_shape(result))
+            result = self.make_block(values=_block_shape(result,
+                                                         ndim=self.ndim))
 
         return result
@@ -1260,32 +1267,117 @@ def equals(self, other):
             return False
         return array_equivalent(self.values, other.values)
 
-    def quantile(self, qs, mgr=None, **kwargs):
+    def quantile(self, qs, interpolation='linear', axis=0, mgr=None):
         """
         compute the quantiles of the
 
         Parameters
         ----------
-        qs : a scalar or list of the quantiles to be computed
+        qs: a scalar or list of the quantiles to be computed
+        interpolation: type of interpolation, default 'linear'
+        axis: axis to compute, default 0
+
+        Returns
+        -------
+        tuple of (axis, block)
+
+        """
+        if _np_version_under1p9:
+            if interpolation != 'linear':
+                raise ValueError("Interpolation methods other than linear "
+                                 "are not supported in numpy < 1.9.")
+
+        kw = {}
+        if not _np_version_under1p9:
+            kw.update({'interpolation': interpolation})
 
         values = self.get_values()
-        values, mask, _, _ = self._try_coerce_args(values, values)
+        values, _, _, _ = self._try_coerce_args(values, values)
+        mask = isnull(self.values)
         if not lib.isscalar(mask) and mask.any():
-            values = values[~mask]
 
-        if len(values) == 0:
-            if com.is_list_like(qs):
-                result = np.array([self.fill_value])
+            # even though this could be a 2-d mask it appears
+            # as a 1-d result
+            mask = mask.reshape(values.shape)
+            result_shape = tuple([values.shape[0]] + [-1] * (self.ndim - 1))
+            values = _block_shape(values[~mask], ndim=self.ndim)
+            if self.ndim > 1:
+                values = values.reshape(result_shape)
+
+        from pandas import Float64Index
+        is_empty = values.shape[axis] == 0
+        if com.is_list_like(qs):
+            ax = Float64Index(qs)
+
+            if is_empty:
+                if self.ndim == 1:
+                    result = self._na_value
+                else:
+                    # create the array of na_values
+                    # 2d len(values) * len(qs)
+                    result = np.repeat(np.array([self._na_value] * len(qs)),
+                                       len(values)).reshape(len(values),
+                                                            len(qs))
             else:
-                result = self._na_value
-        elif com.is_list_like(qs):
-            values = [_quantile(values, x * 100, **kwargs) for x in qs]
-            result = np.array(values)
+
+                try:
+                    result = _quantile(values, np.array(qs) * 100,
+                                       axis=axis, **kw)
+                except ValueError:
+
+                    # older numpies don't handle an array for q
+                    result = [_quantile(values, q * 100,
+                                        axis=axis, **kw) for q in qs]
+
+                result = np.array(result, copy=False)
+                if self.ndim > 1:
+                    result = result.T
+
         else:
-            result = _quantile(values, qs * 100, **kwargs)
 
-        return self._try_coerce_result(result)
+            if self.ndim == 1:
+                ax = Float64Index([qs])
+            else:
+                ax = mgr.axes[0]
+
+            if is_empty:
+                if self.ndim == 1:
+                    result = self._na_value
+                else:
+                    result = np.array([self._na_value] * len(self))
+            else:
+                result = _quantile(values, qs * 100, axis=axis, **kw)
+
+        ndim = getattr(result, 'ndim', None) or 0
+        result = self._try_coerce_result(result)
+        if lib.isscalar(result):
+            return ax, self.make_block_scalar(result)
+        return ax, make_block(result,
+                              placement=np.arange(len(result)),
+                              ndim=ndim)
+
+
+class ScalarBlock(Block):
+    """
+    a scalar compat Block
+    """
+    __slots__ = ['_mgr_locs', 'values', 'ndim']
+
+    def __init__(self, values):
+        self.ndim = 0
+        self.mgr_locs = [0]
+        self.values = values
+
+    @property
+    def dtype(self):
+        return type(self.values)
+
+    @property
+    def shape(self):
+        return tuple([0])
+
+    def __len__(self):
+        return 0
 
 class NonConsolidatableMixIn(object):
@@ -1378,6 +1470,8 @@ def putmask(self, mask, new, align=True, inplace=False, axis=0,
 
             if isinstance(new, np.ndarray) and len(new) == len(mask):
                 new = new[mask]
+
+            mask = mask.reshape(new_values.shape)
             new_values[mask] = new
             new_values = self._try_coerce_result(new_values)
             return [self.make_block(values=new_values)]
@@ -1676,6 +1770,7 @@ def convert(self, *args, **kwargs):
         can return multiple blocks!
         """
+
         if args:
             raise NotImplementedError
 
         by_item = True if 'by_item' not in kwargs else kwargs['by_item']
@@ -1706,8 +1801,13 @@ def convert(self, *args, **kwargs):
         for i, rl in enumerate(self.mgr_locs):
             values = self.iget(i)
 
-            values = fn(values.ravel(), **fn_kwargs).reshape(values.shape)
-            values = _block_shape(values, ndim=self.ndim)
+            shape = values.shape
+            values = fn(values.ravel(), **fn_kwargs)
+            try:
+                values = values.reshape(shape)
+                values = _block_shape(values, ndim=self.ndim)
+            except AttributeError:
+                pass
             newb = make_block(values, ndim=self.ndim, placement=[rl])
             blocks.append(newb)
@@ -2115,7 +2215,10 @@ def _try_coerce_result(self, result):
         """ reverse of try_coerce_args """
         if isinstance(result, np.ndarray):
             if result.dtype.kind in ['i', 'f', 'O']:
-                result = result.astype('M8[ns]')
+                try:
+                    result = result.astype('M8[ns]')
+                except ValueError:
+                    pass
         elif isinstance(result, (np.integer, np.float, np.datetime64)):
             result = self._box_func(result)
         return result
@@ -2219,11 +2322,6 @@ def to_object_block(self, mgr):
             kwargs['placement'] = [0]
         return self.make_block(values, klass=ObjectBlock, **kwargs)
 
-    def replace(self, *args, **kwargs):
-        # if we are forced to ObjectBlock, then don't coerce (to UTC)
-        kwargs['convert'] = False
-        return super(DatetimeTZBlock, self).replace(*args, **kwargs)
-
     def _slice(self, slicer):
         """ return a slice of my values """
         if isinstance(slicer, tuple):
@@ -2246,8 +2344,8 @@ def _try_coerce_args(self, values, other):
         -------
         base-type values, values mask, base-type other, other mask
         """
-        values_mask = isnull(values)
-        values = values.tz_localize(None).asi8
+        values_mask = _block_shape(isnull(values), ndim=self.ndim)
+        values = _block_shape(values.tz_localize(None).asi8, ndim=self.ndim)
         other_mask = False
 
         if isinstance(other, ABCSeries):
@@ -2283,6 +2381,9 @@ def _try_coerce_result(self, result):
         elif isinstance(result, (np.integer, np.float, np.datetime64)):
             result = lib.Timestamp(result).tz_localize(self.values.tz)
         if isinstance(result, np.ndarray):
+            # allow passing of > 1dim if its trivial
+            if result.ndim > 1:
+                result = result.reshape(len(result))
             result = self._holder(result).tz_localize(self.values.tz)
         return result
@@ -2809,7 +2910,7 @@ def _verify_integrity(self):
                                  len(self.items), tot_items))
 
     def apply(self, f, axes=None, filter=None, do_integrity_check=False,
-              consolidate=True, raw=False, **kwargs):
+              consolidate=True, **kwargs):
         """
         iterate over the blocks, collect and create
         a new block manager
@@ -2823,7 +2924,6 @@ def apply(self, f, axes=None, filter=None, do_integrity_check=False,
             integrity check
         consolidate: boolean, default True. Join together blocks having same
             dtype
-        raw: boolean, default False. Return the raw returned results
 
         Returns
        -------
@@ -2890,17 +2990,102 @@ def apply(self, f, axes=None, filter=None, do_integrity_check=False,
             applied = getattr(b, f)(**kwargs)
             result_blocks = _extend_blocks(applied, result_blocks)
 
-        if raw:
-            if self._is_single_block:
-                return result_blocks[0]
-            return result_blocks
-        elif len(result_blocks) == 0:
+        if len(result_blocks) == 0:
             return self.make_empty(axes or self.axes)
         bm = self.__class__(result_blocks, axes or self.axes,
                             do_integrity_check=do_integrity_check)
         bm._consolidate_inplace()
         return bm
 
+    def reduction(self, f, axis=0, consolidate=True, transposed=False,
+                  **kwargs):
+        """
+        iterate over the blocks, collect and create a new block manager.
+        This routine is intended for reduction type operations and
+        will do inference on the generated blocks.
+
+        Parameters
+        ----------
+        f: the callable or function name to operate on at the block level
+        axis: reduction axis, default 0
+        consolidate: boolean, default True. Join together blocks having same
+            dtype
+        transposed: boolean, default False
+            we are holding transposed data
+
+        Returns
+        -------
+        Block Manager (new object)
+
+        """
+
+        if consolidate:
+            self._consolidate_inplace()
+
+        axes, blocks = [], []
+        for b in self.blocks:
+            kwargs['mgr'] = self
+            axe, block = getattr(b, f)(axis=axis, **kwargs)
+
+            axes.append(axe)
+            blocks.append(block)
+
+        # note that some DatetimeTZ, Categorical are always ndim==1
+        ndim = set([b.ndim for b in blocks])
+
+        if 2 in ndim:
+
+            new_axes = list(self.axes)
+
+            # multiple blocks that are reduced
+            if len(blocks) > 1:
+                new_axes[1] = axes[0]
+
+                # reset the placement to the original
+                for b, sb in zip(blocks, self.blocks):
+                    b.mgr_locs = sb.mgr_locs
+
+            else:
+                new_axes[axis] = Index(np.concatenate(
+                    [ax.values for ax in axes]))
+
+            if transposed:
+                new_axes = new_axes[::-1]
+                blocks = [b.make_block(b.values.T,
+                                       placement=np.arange(b.shape[1])
+                                       ) for b in blocks]
+
+            return self.__class__(blocks, new_axes)
+
+        # 0 ndim
+        if 0 in ndim and 1 not in ndim:
+            values = np.array([b.values for b in blocks])
+            if len(values) == 1:
+                return values.item()
+            blocks = [make_block(values, ndim=1)]
+            axes = Index([ax[0] for ax in axes])
+
+        # single block
+        values = _concat._concat_compat([b.values for b in blocks])
+
+        # compute the orderings of our original data
+        if len(self.blocks) > 1:
+
+            indexer = np.empty(len(self.axes[0]), dtype='int64')
+            i = 0
+            for b in self.blocks:
+                for j in b.mgr_locs:
+                    indexer[j] = i
+                    i = i + 1
+
+            values = values.take(indexer)
+
+        return SingleBlockManager(
+            [make_block(values,
+                        ndim=1,
+                        placement=np.arange(len(values)))],
+            axes[0])
+
     def isnull(self, **kwargs):
         return self.apply('apply', **kwargs)
@@ -2911,7 +3096,7 @@ def eval(self, **kwargs):
         return self.apply('eval', **kwargs)
 
     def quantile(self, **kwargs):
-        return self.apply('quantile', raw=True, **kwargs)
+        return self.reduction('quantile', **kwargs)
 
     def setitem(self, **kwargs):
         return self.apply('setitem', **kwargs)
@@ -3068,7 +3253,6 @@ def combine(self, blocks, copy=True):
         indexer = np.sort(np.concatenate([b.mgr_locs.as_array
                                           for b in blocks]))
         inv_indexer = lib.get_reverse_indexer(indexer, self.shape[0])
-        new_items = self.items.take(indexer)
 
         new_blocks = []
         for b in blocks:
@@ -3077,9 +3261,10 @@ def combine(self, blocks, copy=True):
                              axis=0, allow_fill=False)
             new_blocks.append(b)
 
-        new_axes = list(self.axes)
-        new_axes[0] = new_items
-        return self.__class__(new_blocks, new_axes, do_integrity_check=False)
+        axes = list(self.axes)
+        axes[0] = self.items.take(indexer)
+ + return self.__class__(new_blocks, axes, do_integrity_check=False) def get_slice(self, slobj, axis=0): if axis >= self.ndim: @@ -3829,6 +4014,16 @@ def _block(self): def _values(self): return self._block.values + @property + def _blknos(self): + """ compat with BlockManager """ + return None + + @property + def _blklocs(self): + """ compat with BlockManager """ + return None + def reindex(self, new_axis, indexer=None, method=None, fill_value=None, limit=None, copy=True): # if we are the same and don't copy, just return @@ -4317,7 +4512,7 @@ def _extend_blocks(result, blocks=None): def _block_shape(values, ndim=1, shape=None): """ guarantee the shape of the values to be at least 1 d """ - if values.ndim <= ndim: + if values.ndim < ndim: if shape is None: shape = values.shape values = values.reshape(tuple((1, ) + shape)) diff --git a/pandas/core/series.py b/pandas/core/series.py index 58e983ad904ba..43b4ba3a51212 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -57,8 +57,6 @@ from pandas.core.config import get_option -from pandas import _np_version_under1p9 - __all__ = ['Series'] _shared_doc_kwargs = dict( @@ -1349,21 +1347,12 @@ def quantile(self, q=0.5, interpolation='linear'): self._check_percentile(q) - if _np_version_under1p9: - if interpolation != 'linear': - raise ValueError("Interpolation methods other than linear " - "are not supported in numpy < 1.9.") - - kwargs = dict() - if not _np_version_under1p9: - kwargs.update({'interpolation': interpolation}) + result = self._data.quantile(qs=q, interpolation=interpolation) - result = self._data.quantile(qs=q, **kwargs) - - if com.is_list_like(result): - # explicitly use Float64Index to coerce empty result to float dtype - index = Float64Index(q) - return self._constructor(result, index=index, name=self.name) + if com.is_list_like(q): + return self._constructor(result, + index=Float64Index(q), + name=self.name) else: # scalar return result diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index dff2c6f0df7b1..318fd17b8f88e 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -3826,24 +3826,24 @@ def write_data(self, chunksize, dropna=False): nrows = self.nrows_expected # if dropna==True, then drop ALL nan rows + masks = [] if dropna: - masks = [] for a in self.values_axes: # figure the mask: only do if we can successfully process this # column, otherwise ignore the mask mask = com.isnull(a.data).all(axis=0) - masks.append(mask.astype('u1', copy=False)) + if isinstance(mask, np.ndarray): + masks.append(mask.astype('u1', copy=False)) - # consolidate masks + # consolidate masks + if len(masks): mask = masks[0] for m in masks[1:]: mask = mask & m mask = mask.ravel() - else: - mask = None # broadcast the indexes if needed diff --git a/pandas/src/inference.pyx b/pandas/src/inference.pyx index 843031fafa1a9..3ccc1c4f9336c 100644 --- a/pandas/src/inference.pyx +++ b/pandas/src/inference.pyx @@ -642,6 +642,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, bint seen_float = 0 bint seen_complex = 0 bint seen_datetime = 0 + bint seen_datetimetz = 0 bint seen_timedelta = 0 bint seen_int = 0 bint seen_bool = 0 @@ -675,6 +676,15 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, if val is None: seen_null = 1 floats[i] = complexes[i] = fnan + elif val is NaT: + if convert_datetime: + idatetimes[i] = iNaT + seen_datetime = 1 + if convert_timedelta: + itimedeltas[i] = iNaT + seen_timedelta = 1 + if not (convert_datetime or convert_timedelta): + seen_object = 1 elif 
util.is_bool_object(val):
seen_bool = 1
bools[i] = val
@@ -710,9 +720,15 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0,
complexes[i] = val
seen_complex = 1
elif PyDateTime_Check(val) or util.is_datetime64_object(val):
+
+ # if we have a tz attached then return the objects
if convert_datetime:
- seen_datetime = 1
- idatetimes[i] = convert_to_tsobject(val, None, None, 0, 0).value
+ if getattr(val, 'tzinfo', None) is not None:
+ seen_datetimetz = 1
+ break
+ else:
+ seen_datetime = 1
+ idatetimes[i] = convert_to_tsobject(val, None, None, 0, 0).value
else:
seen_object = 1
break
@@ -731,6 +747,13 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0,
seen_numeric = seen_complex or seen_float or seen_int
+ # we try to coerce datetime w/tz but must all have the same tz
+ if seen_datetimetz:
+ if len(set([ getattr(val, 'tz', None) for val in objects ])) == 1:
+ from pandas import DatetimeIndex
+ return DatetimeIndex(objects)
+ seen_object = 1
+
if not seen_object:
if not safe:
diff --git a/pandas/tests/frame/test_quantile.py b/pandas/tests/frame/test_quantile.py
index d883363812ddb..52e8697abe850 100644
--- a/pandas/tests/frame/test_quantile.py
+++ b/pandas/tests/frame/test_quantile.py
@@ -28,9 +28,12 @@ def test_quantile(self):
q = self.tsframe.quantile(0.1, axis=0)
self.assertEqual(q['A'], percentile(self.tsframe['A'], 10))
+ tm.assert_index_equal(q.index, self.tsframe.columns)
+
q = self.tsframe.quantile(0.9, axis=1)
- q = self.intframe.quantile(0.1)
- self.assertEqual(q['A'], percentile(self.intframe['A'], 10))
+ self.assertEqual(q['2000-01-17'],
+ percentile(self.tsframe.loc['2000-01-17'], 90))
+ tm.assert_index_equal(q.index, self.tsframe.index)
# test degenerate case
q = DataFrame({'x': [], 'y': []}).quantile(0.1, axis=0)
@@ -39,13 +42,13 @@ def test_quantile(self):
# non-numeric exclusion
df = DataFrame({'col1': ['A', 'A', 'B', 'B'], 'col2': [1, 2, 3, 4]})
rs = df.quantile(0.5)
- xp = df.median()
+ xp = df.median().rename(0.5)
assert_series_equal(rs, xp)
# axis
df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3])
result = df.quantile(.5, axis=1)
- expected = Series([1.5, 2.5, 3.5], index=[1, 2, 3])
+ expected = Series([1.5, 2.5, 3.5], index=[1, 2, 3], name=0.5)
assert_series_equal(result, expected)
result = df.quantile([.5, .75], axis=1)
@@ -59,9 +62,25 @@ def test_quantile(self):
df = DataFrame([[1, 2, 3], ['a', 'b', 4]])
result = df.quantile(.5, axis=1)
- expected = Series([3., 4.], index=[0, 1])
+ expected = Series([3., 4.], index=[0, 1], name=0.5)
assert_series_equal(result, expected)
+ def test_quantile_axis_mixed(self):
+
+ # mixed on axis=1
+ df = DataFrame({"A": [1, 2, 3],
+ "B": [2., 3., 4.],
+ "C": pd.date_range('20130101', periods=3),
+ "D": ['foo', 'bar', 'baz']})
+ result = df.quantile(.5, axis=1)
+ expected = Series([1.5, 2.5, 3.5], name=0.5)
+ assert_series_equal(result, expected)
+
+ # must raise
+ def f():
+ df.quantile(.5, axis=1, numeric_only=False)
+ self.assertRaises(TypeError, f)
+
def test_quantile_axis_parameter(self):
# GH 9543/9544
@@ -69,7 +88,7 @@ def test_quantile_axis_parameter(self):
result = df.quantile(.5, axis=0)
- expected = Series([2., 3.], index=["A", "B"])
+ expected = Series([2., 3.], index=["A", "B"], name=0.5)
assert_series_equal(result, expected)
expected = df.quantile(.5, axis="index")
@@ -77,7 +96,7 @@ def test_quantile_axis_parameter(self):
result = df.quantile(.5, axis=1)
- expected = Series([1.5, 2.5, 3.5], index=[1, 2, 3])
+ expected = Series([1.5, 2.5, 3.5], index=[1, 2, 3], name=0.5)
assert_series_equal(result, expected) result = df.quantile(.5, axis="columns") @@ -107,22 +126,23 @@ def test_quantile_interpolation(self): # interpolation method other than default linear df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3]) result = df.quantile(.5, axis=1, interpolation='nearest') - expected = Series([1, 2, 3], index=[1, 2, 3]) + expected = Series([1, 2, 3], index=[1, 2, 3], name=0.5) assert_series_equal(result, expected) + # cross-check interpolation=nearest results in original dtype exp = np.percentile(np.array([[1, 2, 3], [2, 3, 4]]), .5, axis=0, interpolation='nearest') - expected = Series(exp, index=[1, 2, 3], dtype='int64') + expected = Series(exp, index=[1, 2, 3], name=0.5, dtype='int64') assert_series_equal(result, expected) # float df = DataFrame({"A": [1., 2., 3.], "B": [2., 3., 4.]}, index=[1, 2, 3]) result = df.quantile(.5, axis=1, interpolation='nearest') - expected = Series([1., 2., 3.], index=[1, 2, 3]) + expected = Series([1., 2., 3.], index=[1, 2, 3], name=0.5) assert_series_equal(result, expected) exp = np.percentile(np.array([[1., 2., 3.], [2., 3., 4.]]), .5, axis=0, interpolation='nearest') - expected = Series(exp, index=[1, 2, 3], dtype='float64') + expected = Series(exp, index=[1, 2, 3], name=0.5, dtype='float64') assert_series_equal(result, expected) # axis @@ -217,7 +237,8 @@ def test_quantile_datetime(self): # datetime result = df.quantile(.5, numeric_only=False) expected = Series([Timestamp('2010-07-02 12:00:00'), 2.5], - index=['a', 'b']) + index=['a', 'b'], + name=0.5) assert_series_equal(result, expected) # datetime w/ multi @@ -231,7 +252,8 @@ def test_quantile_datetime(self): result = df[['a', 'c']].quantile(.5, axis=1, numeric_only=False) expected = Series([Timestamp('2010-07-02 12:00:00'), Timestamp('2011-07-02 12:00:00')], - index=[0, 1]) + index=[0, 1], + name=0.5) assert_series_equal(result, expected) result = df[['a', 'c']].quantile([.5], axis=1, numeric_only=False) @@ -256,12 +278,13 @@ def test_quantile_box(self): 'C': [pd.Timedelta('1 days'), pd.Timedelta('2 days'), pd.Timedelta('3 days')]}) + res = df.quantile(0.5, numeric_only=False) - # when squeezed, result.name is explicitly reset + exp = pd.Series([pd.Timestamp('2011-01-02'), pd.Timestamp('2011-01-02', tz='US/Eastern'), pd.Timedelta('2 days')], - name=None, index=['A', 'B', 'C']) + name=0.5, index=['A', 'B', 'C']) tm.assert_series_equal(res, exp) res = df.quantile([0.5], numeric_only=False) @@ -305,7 +328,7 @@ def test_quantile_box(self): pd.Timestamp('2011-01-02', tz='US/Eastern'), pd.Timedelta('2 days'), pd.Timedelta('2 days')], - name=None, index=list('AaBbCc')) + name=0.5, index=list('AaBbCc')) tm.assert_series_equal(res, exp) res = df.quantile([0.5], numeric_only=False) diff --git a/pandas/tests/series/test_quantile.py b/pandas/tests/series/test_quantile.py index f538fa4e90401..e0bff7fbd39e4 100644 --- a/pandas/tests/series/test_quantile.py +++ b/pandas/tests/series/test_quantile.py @@ -126,6 +126,14 @@ def test_quantile_interpolation_np_lt_1p9(self): interpolation='higher') def test_quantile_nan(self): + + # GH 13098 + s = pd.Series([1, 2, 3, 4, np.nan]) + result = s.quantile(0.5) + expected = 2.5 + self.assertEqual(result, expected) + + # all nan/empty cases = [Series([]), Series([np.nan, np.nan])] for s in cases: diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 5bd5c80f18386..583b1c7aea270 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -2676,7 +2676,7 @@ def f(x, q=None, axis=0): trans_expected 
= ts_grouped.transform(g) assert_series_equal(apply_result, agg_expected) - assert_series_equal(agg_result, agg_expected) + assert_series_equal(agg_result, agg_expected, check_names=False) assert_series_equal(trans_result, trans_expected) agg_result = ts_grouped.agg(f, q=80) @@ -2692,11 +2692,11 @@ def f(x, q=None, axis=0): apply_result = df_grouped.apply(DataFrame.quantile, .8) expected = df_grouped.quantile(.8) assert_frame_equal(apply_result, expected) - assert_frame_equal(agg_result, expected) + assert_frame_equal(agg_result, expected, check_names=False) agg_result = df_grouped.agg(f, q=80) apply_result = df_grouped.apply(DataFrame.quantile, q=.8) - assert_frame_equal(agg_result, expected) + assert_frame_equal(agg_result, expected, check_names=False) assert_frame_equal(apply_result, expected) def test_size(self): From c9ffd7891dadd6e5590695e142f77a3476b5c4e3 Mon Sep 17 00:00:00 2001 From: dsm054 Date: Fri, 13 May 2016 10:47:06 +0200 Subject: [PATCH 09/96] DOC: Fix delim_whitespace regex typo. Minor typo in the explanation of delim_whitespace which tripped up a user on SO (although the user should probably have been using `delim_whitespace=True` directly anyhow.) Author: dsm054 Closes #13165 from dsm054/fix-delim_whitespace-regex and squashes the following commits: c8f13d2 [dsm054] DOC: Fix delim_whitespace regex typo. --- doc/source/io.rst | 2 +- pandas/io/parsers.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index cc51fbd1e30ab..af8bca14e5d6f 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -99,7 +99,7 @@ delimiter : str, default ``None`` Alternative argument name for sep. delim_whitespace : boolean, default False Specifies whether or not whitespace (e.g. ``' '`` or ``'\t'``) - will be used as the delimiter. Equivalent to setting ``sep='\+s'``. + will be used as the delimiter. Equivalent to setting ``sep='\s+'``. If this option is set to True, nothing should be passed in for the ``delimiter`` parameter. diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index f4527df56db88..25639984e4ccf 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -55,7 +55,7 @@ Alternative argument name for sep. delim_whitespace : boolean, default False Specifies whether or not whitespace (e.g. ``' '`` or ``'\t'``) will be - used as the sep. Equivalent to setting ``sep='\+s'``. If this option + used as the sep. Equivalent to setting ``sep='\s+'``. If this option is set to True, nothing should be passed in for the ``delimiter`` parameter. 
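The equivalence the corrected docs describe is easy to check directly; the following is a minimal sketch (the inline sample data is illustrative only, not taken from the patch):

    import pandas as pd
    from pandas.compat import StringIO

    data = "a b  c\n1 2  3\n4 5  6"
    # both spellings treat any run of whitespace as a single delimiter
    df1 = pd.read_csv(StringIO(data), delim_whitespace=True)
    df2 = pd.read_csv(StringIO(data), sep=r'\s+')
    assert df1.equals(df2)

Note that a regex ``sep`` such as ``'\s+'`` falls back to the Python parsing engine, while ``delim_whitespace=True`` stays on the faster C engine; the parsed result is the same either way.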
From e5c18b4383bd49b7a6f42f9e3c299c8746b5a347 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Lipt=C3=A1k?= Date: Fri, 13 May 2016 09:14:18 -0400 Subject: [PATCH 10/96] BUG: Correct KeyError from matplotlib when processing Series yerr MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit closes #11858 Author: Gábor Lipták Closes #13114 from gliptak/yerr1 and squashes the following commits: 926329a [Gábor Lipták] Correct KeyError from matplotlib when processing Series xerr/yerr --- codecov.yml | 4 ---- doc/source/whatsnew/v0.18.2.txt | 2 +- pandas/tools/plotting.py | 4 ++++ pandas/tseries/tests/test_plotting.py | 7 +++++++ 4 files changed, 12 insertions(+), 5 deletions(-) diff --git a/codecov.yml b/codecov.yml index 86e7dd55c9550..45a6040c6a50d 100644 --- a/codecov.yml +++ b/codecov.yml @@ -7,7 +7,3 @@ coverage: default: target: '50' branches: null - changes: - default: - branches: - - master diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index 85209c0dfa03d..0bab6c2ff74e0 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -33,7 +33,6 @@ Other enhancements - .. _whatsnew_0182.api: API changes @@ -108,6 +107,7 @@ Performance Improvements Bug Fixes ~~~~~~~~~ - Bug in ``SparseDataFrame`` in which ``axis=None`` did not default to ``axis=0`` (:issue:`13048`) +- Bug when passing a not-default-indexed ``Series`` as ``xerr`` or ``yerr`` in ``.plot()`` (:issue:`11858`) diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index 808c9d22c53c8..baca8045f0cc1 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -1331,6 +1331,10 @@ def _plot(cls, ax, x, y, style=None, is_errorbar=False, **kwds): x = x._mpl_repr() if is_errorbar: + if 'xerr' in kwds: + kwds['xerr'] = np.array(kwds.get('xerr')) + if 'yerr' in kwds: + kwds['yerr'] = np.array(kwds.get('yerr')) return ax.errorbar(x, y, **kwds) else: # prevent style kwarg from going to errorbar, where it is diff --git a/pandas/tseries/tests/test_plotting.py b/pandas/tseries/tests/test_plotting.py index 9fab9c0990ef0..0284df9e58933 100644 --- a/pandas/tseries/tests/test_plotting.py +++ b/pandas/tseries/tests/test_plotting.py @@ -76,6 +76,13 @@ def test_frame_inferred(self): df = DataFrame(np.random.randn(len(idx), 3), index=idx) _check_plot_works(df.plot) + def test_is_error_nozeroindex(self): + # GH11858 + i = np.array([1, 2, 3]) + a = DataFrame(i, index=i) + _check_plot_works(a.plot, xerr=a) + _check_plot_works(a.plot, yerr=a) + def test_nonnumeric_exclude(self): import matplotlib.pyplot as plt From 00d4ec3e7b7fa68d5cf226f7b63a5eea23167b45 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Fri, 13 May 2016 09:20:23 -0400 Subject: [PATCH 11/96] BUG: Misc fixes for SparseSeries indexing with MI closes #13144 Author: sinhrks Closes #13163 from sinhrks/sparse_multi and squashes the following commits: eb24102 [sinhrks] BUG: Misc fixes for SparseSeries indexing with MI --- doc/source/whatsnew/v0.18.2.txt | 3 + pandas/indexes/multi.py | 4 +- pandas/sparse/series.py | 20 ++-- pandas/sparse/tests/test_format.py | 60 +++++++++++ pandas/sparse/tests/test_indexing.py | 142 ++++++++++++++++++++++++--- pandas/sparse/tests/test_series.py | 9 ++ pandas/tests/formats/test_format.py | 19 ---- 7 files changed, 214 insertions(+), 43 deletions(-) create mode 100644 pandas/sparse/tests/test_format.py diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index 0bab6c2ff74e0..bae8b1358826b 100644 --- 
a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -106,6 +106,9 @@ Performance Improvements Bug Fixes ~~~~~~~~~ + +- Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing may raise ``IndexError`` (:issue:`13144`) +- Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing result may have normal ``Index`` (:issue:`13144`) - Bug in ``SparseDataFrame`` in which ``axis=None`` did not default to ``axis=0`` (:issue:`13048`) - Bug when passing a not-default-indexed ``Series`` as ``xerr`` or ``yerr`` in ``.plot()`` (:issue:`11858`) diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py index 3effc9b1315e6..db2f80ae78446 100644 --- a/pandas/indexes/multi.py +++ b/pandas/indexes/multi.py @@ -592,7 +592,6 @@ def fillna(self, value=None, downcast=None): def get_value(self, series, key): # somewhat broken encapsulation from pandas.core.indexing import maybe_droplevels - from pandas.core.series import Series # Label-based s = _values_from_object(series) @@ -604,7 +603,8 @@ def _try_mi(k): new_values = series._values[loc] new_index = self[loc] new_index = maybe_droplevels(new_index, k) - return Series(new_values, index=new_index, name=series.name) + return series._constructor(new_values, index=new_index, + name=series.name).__finalize__(self) try: return self._engine.get_value(s, k) diff --git a/pandas/sparse/series.py b/pandas/sparse/series.py index a783a7c596955..519068b97a010 100644 --- a/pandas/sparse/series.py +++ b/pandas/sparse/series.py @@ -5,14 +5,13 @@ # pylint: disable=E1101,E1103,W0231 -from numpy import nan, ndarray import numpy as np import warnings import operator from pandas.compat.numpy import function as nv from pandas.core.common import isnull, _values_from_object, _maybe_match_name -from pandas.core.index import Index, _ensure_index +from pandas.core.index import Index, _ensure_index, InvalidIndexError from pandas.core.series import Series from pandas.core.frame import DataFrame from pandas.core.internals import SingleBlockManager @@ -135,7 +134,7 @@ def __init__(self, data=None, index=None, sparse_index=None, kind='block', if is_sparse_array: fill_value = data.fill_value else: - fill_value = nan + fill_value = np.nan if is_sparse_array: if isinstance(data, SparseSeries) and index is None: @@ -393,8 +392,10 @@ def _get_val_at(self, loc): def __getitem__(self, key): try: - return self._get_val_at(self.index.get_loc(key)) + return self.index.get_value(self, key) + except InvalidIndexError: + pass except KeyError: if isinstance(key, (int, np.integer)): return self._get_val_at(key) @@ -406,13 +407,12 @@ def __getitem__(self, key): # Could not hash item, must be array-like? pass - # is there a case where this would NOT be an ndarray? 
- # need to find an example, I took out the case for now - key = _values_from_object(key) - dataSlice = self.values[key] - new_index = Index(self.index.view(ndarray)[key]) - return self._constructor(dataSlice, index=new_index).__finalize__(self) + if self.index.nlevels > 1 and isinstance(key, tuple): + # to handle MultiIndex labels + key = self.index.get_loc(key) + return self._constructor(self.values[key], + index=self.index[key]).__finalize__(self) def _get_values(self, indexer): try: diff --git a/pandas/sparse/tests/test_format.py b/pandas/sparse/tests/test_format.py new file mode 100644 index 0000000000000..2981e0f4af0bf --- /dev/null +++ b/pandas/sparse/tests/test_format.py @@ -0,0 +1,60 @@ +# -*- coding: utf-8 -*- +from __future__ import print_function + +import numpy as np +import pandas as pd + +import pandas.util.testing as tm +from pandas.compat import (is_platform_windows, + is_platform_32bit) +from pandas.core.config import option_context + + +use_32bit_repr = is_platform_windows() or is_platform_32bit() + + +class TestSeriesFormatting(tm.TestCase): + + _multiprocess_can_split_ = True + + def test_sparse_max_row(self): + s = pd.Series([1, np.nan, np.nan, 3, np.nan]).to_sparse() + result = repr(s) + dtype = '' if use_32bit_repr else ', dtype=int32' + exp = ("0 1.0\n1 NaN\n2 NaN\n3 3.0\n" + "4 NaN\ndtype: float64\nBlockIndex\n" + "Block locations: array([0, 3]{0})\n" + "Block lengths: array([1, 1]{0})".format(dtype)) + self.assertEqual(result, exp) + + with option_context("display.max_rows", 3): + # GH 10560 + result = repr(s) + exp = ("0 1.0\n ... \n4 NaN\n" + "dtype: float64\nBlockIndex\n" + "Block locations: array([0, 3]{0})\n" + "Block lengths: array([1, 1]{0})".format(dtype)) + self.assertEqual(result, exp) + + def test_sparse_mi_max_row(self): + idx = pd.MultiIndex.from_tuples([('A', 0), ('A', 1), ('B', 0), + ('C', 0), ('C', 1), ('C', 2)]) + s = pd.Series([1, np.nan, np.nan, 3, np.nan, np.nan], + index=idx).to_sparse() + result = repr(s) + dtype = '' if use_32bit_repr else ', dtype=int32' + exp = ("A 0 1.0\n 1 NaN\nB 0 NaN\n" + "C 0 3.0\n 1 NaN\n 2 NaN\n" + "dtype: float64\nBlockIndex\n" + "Block locations: array([0, 3], dtype=int32)\n" + "Block lengths: array([1, 1]{0})".format(dtype)) + self.assertEqual(result, exp) + + with option_context("display.max_rows", 3): + # GH 13144 + result = repr(s) + exp = ("A 0 1.0\n ... 
\nC 2 NaN\n" + "dtype: float64\nBlockIndex\n" + "Block locations: array([0, 3], dtype=int32)\n" + "Block lengths: array([1, 1]{0})".format(dtype)) + self.assertEqual(result, exp) diff --git a/pandas/sparse/tests/test_indexing.py b/pandas/sparse/tests/test_indexing.py index ca2996941aef7..1f88d22bd8f93 100644 --- a/pandas/sparse/tests/test_indexing.py +++ b/pandas/sparse/tests/test_indexing.py @@ -10,9 +10,13 @@ class TestSparseSeriesIndexing(tm.TestCase): _multiprocess_can_split_ = True + def setUp(self): + self.orig = pd.Series([1, np.nan, np.nan, 3, np.nan]) + self.sparse = self.orig.to_sparse() + def test_getitem(self): - orig = pd.Series([1, np.nan, np.nan, 3, np.nan]) - sparse = orig.to_sparse() + orig = self.orig + sparse = self.sparse self.assertEqual(sparse[0], 1) self.assertTrue(np.isnan(sparse[1])) @@ -33,8 +37,9 @@ def test_getitem(self): tm.assert_sp_series_equal(result, exp) def test_getitem_slice(self): - orig = pd.Series([1, np.nan, np.nan, 3, np.nan]) - sparse = orig.to_sparse() + orig = self.orig + sparse = self.sparse + tm.assert_sp_series_equal(sparse[:2], orig[:2].to_sparse()) tm.assert_sp_series_equal(sparse[4:2], orig[4:2].to_sparse()) tm.assert_sp_series_equal(sparse[::2], orig[::2].to_sparse()) @@ -84,8 +89,8 @@ def test_getitem_slice_fill_value(self): orig[-5:].to_sparse(fill_value=0)) def test_loc(self): - orig = pd.Series([1, np.nan, np.nan, 3, np.nan]) - sparse = orig.to_sparse() + orig = self.orig + sparse = self.sparse self.assertEqual(sparse.loc[0], 1) self.assertTrue(np.isnan(sparse.loc[1])) @@ -154,10 +159,17 @@ def test_loc_index_fill_value(self): tm.assert_sp_series_equal(result, exp) def test_loc_slice(self): - orig = pd.Series([1, np.nan, np.nan, 3, np.nan]) - sparse = orig.to_sparse() + orig = self.orig + sparse = self.sparse tm.assert_sp_series_equal(sparse.loc[2:], orig.loc[2:].to_sparse()) + def test_loc_slice_index_fill_value(self): + orig = pd.Series([1, np.nan, 0, 3, 0], index=list('ABCDE')) + sparse = orig.to_sparse(fill_value=0) + + tm.assert_sp_series_equal(sparse.loc['C':], + orig.loc['C':].to_sparse(fill_value=0)) + def test_loc_slice_fill_value(self): orig = pd.Series([1, np.nan, 0, 3, 0]) sparse = orig.to_sparse(fill_value=0) @@ -165,8 +177,8 @@ def test_loc_slice_fill_value(self): orig.loc[2:].to_sparse(fill_value=0)) def test_iloc(self): - orig = pd.Series([1, np.nan, np.nan, 3, np.nan]) - sparse = orig.to_sparse() + orig = self.orig + sparse = self.sparse self.assertEqual(sparse.iloc[3], 3) self.assertTrue(np.isnan(sparse.iloc[2])) @@ -234,8 +246,9 @@ def test_at_fill_value(self): self.assertEqual(sparse.at['e'], orig.at['e']) def test_iat(self): - orig = pd.Series([1, np.nan, np.nan, 3, np.nan]) - sparse = orig.to_sparse() + orig = self.orig + sparse = self.sparse + self.assertEqual(sparse.iat[0], orig.iat[0]) self.assertTrue(np.isnan(sparse.iat[1])) self.assertTrue(np.isnan(sparse.iat[2])) @@ -356,6 +369,111 @@ def test_reindex_fill_value(self): tm.assert_sp_series_equal(res, exp) +class TestSparseSeriesMultiIndexing(TestSparseSeriesIndexing): + + _multiprocess_can_split_ = True + + def setUp(self): + # Mi with duplicated values + idx = pd.MultiIndex.from_tuples([('A', 0), ('A', 1), ('B', 0), + ('C', 0), ('C', 1)]) + self.orig = pd.Series([1, np.nan, np.nan, 3, np.nan], index=idx) + self.sparse = self.orig.to_sparse() + + def test_getitem_multi(self): + orig = self.orig + sparse = self.sparse + + self.assertEqual(sparse[0], orig[0]) + self.assertTrue(np.isnan(sparse[1])) + self.assertEqual(sparse[3], orig[3]) + + 
tm.assert_sp_series_equal(sparse['A'], orig['A'].to_sparse())
+ tm.assert_sp_series_equal(sparse['B'], orig['B'].to_sparse())
+
+ result = sparse[[1, 3, 4]]
+ exp = orig[[1, 3, 4]].to_sparse()
+ tm.assert_sp_series_equal(result, exp)
+
+ # dense array
+ result = sparse[orig % 2 == 1]
+ exp = orig[orig % 2 == 1].to_sparse()
+ tm.assert_sp_series_equal(result, exp)
+
+ # sparse array (actually it coerces to normal Series)
+ result = sparse[sparse % 2 == 1]
+ exp = orig[orig % 2 == 1].to_sparse()
+ tm.assert_sp_series_equal(result, exp)
+
+ def test_getitem_multi_tuple(self):
+ orig = self.orig
+ sparse = self.sparse
+
+ self.assertEqual(sparse['C', 0], orig['C', 0])
+ self.assertTrue(np.isnan(sparse['A', 1]))
+ self.assertTrue(np.isnan(sparse['B', 0]))
+
+ def test_getitems_slice_multi(self):
+ orig = self.orig
+ sparse = self.sparse
+
+ tm.assert_sp_series_equal(sparse[2:], orig[2:].to_sparse())
+ tm.assert_sp_series_equal(sparse.loc['B':], orig.loc['B':].to_sparse())
+ tm.assert_sp_series_equal(sparse.loc['C':], orig.loc['C':].to_sparse())
+
+ tm.assert_sp_series_equal(sparse.loc['A':'B'],
+ orig.loc['A':'B'].to_sparse())
+ tm.assert_sp_series_equal(sparse.loc[:'B'], orig.loc[:'B'].to_sparse())
+
+ def test_loc(self):
+ # needs to be overridden to use different labels
+ orig = self.orig
+ sparse = self.sparse
+
+ tm.assert_sp_series_equal(sparse.loc['A'],
+ orig.loc['A'].to_sparse())
+ tm.assert_sp_series_equal(sparse.loc['B'],
+ orig.loc['B'].to_sparse())
+
+ result = sparse.loc[[1, 3, 4]]
+ exp = orig.loc[[1, 3, 4]].to_sparse()
+ tm.assert_sp_series_equal(result, exp)
+
+ # exceeds the bounds
+ result = sparse.loc[[1, 3, 4, 5]]
+ exp = orig.loc[[1, 3, 4, 5]].to_sparse()
+ tm.assert_sp_series_equal(result, exp)
+
+ # dense array
+ result = sparse.loc[orig % 2 == 1]
+ exp = orig.loc[orig % 2 == 1].to_sparse()
+ tm.assert_sp_series_equal(result, exp)
+
+ # sparse array (actually it coerces to normal Series)
+ result = sparse.loc[sparse % 2 == 1]
+ exp = orig.loc[orig % 2 == 1].to_sparse()
+ tm.assert_sp_series_equal(result, exp)
+
+ def test_loc_multi_tuple(self):
+ orig = self.orig
+ sparse = self.sparse
+
+ self.assertEqual(sparse.loc['C', 0], orig.loc['C', 0])
+ self.assertTrue(np.isnan(sparse.loc['A', 1]))
+ self.assertTrue(np.isnan(sparse.loc['B', 0]))
+
+ def test_loc_slice(self):
+ orig = self.orig
+ sparse = self.sparse
+ tm.assert_sp_series_equal(sparse.loc['A':], orig.loc['A':].to_sparse())
+ tm.assert_sp_series_equal(sparse.loc['B':], orig.loc['B':].to_sparse())
+ tm.assert_sp_series_equal(sparse.loc['C':], orig.loc['C':].to_sparse())
+
+ tm.assert_sp_series_equal(sparse.loc['A':'B'],
+ orig.loc['A':'B'].to_sparse())
+ tm.assert_sp_series_equal(sparse.loc[:'B'], orig.loc[:'B'].to_sparse())
+
+
class TestSparseDataFrameIndexing(tm.TestCase):
_multiprocess_can_split_ = True
diff --git a/pandas/sparse/tests/test_series.py b/pandas/sparse/tests/test_series.py
index 44bc51077ef3e..5cbc509b836db 100644
--- a/pandas/sparse/tests/test_series.py
+++ b/pandas/sparse/tests/test_series.py
@@ -1019,6 +1019,15 @@ def test_from_coo_nodense_index(self):
check = check.dropna().to_sparse()
tm.assert_sp_series_equal(ss, check)
+ def test_from_coo_long_repr(self):
+ # GH 13114
+ # test it doesn't raise error. Formatting is tested in test_format
+ tm._skip_if_no_scipy()
+ import scipy.sparse
+
+ sparse = SparseSeries.from_coo(scipy.sparse.rand(350, 18))
+ repr(sparse)
+
def _run_test(self, ss, kwargs, check):
results = ss.to_coo(**kwargs)
self._check_results_to_coo(results, check)
diff --git a/pandas/tests/formats/test_format.py b/pandas/tests/formats/test_format.py
index 96770a86ff383..7a806280916f1 100644
--- a/pandas/tests/formats/test_format.py
+++ b/pandas/tests/formats/test_format.py
@@ -3758,25 +3758,6 @@ def test_to_string_header(self):
exp = '0 0\n ..\n9 9'
self.assertEqual(res, exp)
- def test_sparse_max_row(self):
- s = pd.Series([1, np.nan, np.nan, 3, np.nan]).to_sparse()
- result = repr(s)
- dtype = '' if use_32bit_repr else ', dtype=int32'
- exp = ("0 1.0\n1 NaN\n2 NaN\n3 3.0\n"
- "4 NaN\ndtype: float64\nBlockIndex\n"
- "Block locations: array([0, 3]{0})\n"
- "Block lengths: array([1, 1]{0})".format(dtype))
- self.assertEqual(result, exp)
-
- with option_context("display.max_rows", 3):
- # GH 10560
- result = repr(s)
- exp = ("0 1.0\n ... \n4 NaN\n"
- "dtype: float64\nBlockIndex\n"
- "Block locations: array([0, 3]{0})\n"
- "Block lengths: array([1, 1]{0})".format(dtype))
- self.assertEqual(result, exp)
-
class TestEngFormatter(tm.TestCase):
_multiprocess_can_split_ = True
From 82f54bd1dd53cb031e5d801405b34f062155d823 Mon Sep 17 00:00:00 2001
From: sinhrks
Date: Fri, 13 May 2016 09:22:45 -0400
Subject: [PATCH 12/96] ENH/BUG: str.extractall doesn't support index

closes #10008

Author: sinhrks

Closes #13156 from sinhrks/str_extractall and squashes the following commits:

ed854ef [sinhrks] ENH/BUG: str.extractall doesn't support index
---
doc/source/text.rst | 13 ++++++++++-
doc/source/whatsnew/v0.18.2.txt | 6 ++++++
pandas/core/strings.py | 38 +++++++++++++++++++++------------
pandas/tests/test_strings.py | 28 ++++++++++++++++++++++--
4 files changed, 68 insertions(+), 17 deletions(-)
diff --git a/doc/source/text.rst b/doc/source/text.rst
index 16b16a320f75b..3822c713d7f85 100644
--- a/doc/source/text.rst
+++ b/doc/source/text.rst
@@ -281,7 +281,7 @@ Unlike ``extract`` (which returns only the first match),
.. ipython:: python
- s = pd.Series(["a1a2", "b1", "c1"], ["A", "B", "C"])
+ s = pd.Series(["a1a2", "b1", "c1"], index=["A", "B", "C"])
s
two_groups = '(?P<letter>[a-z])(?P<digit>[0-9])'
s.str.extract(two_groups, expand=True)
@@ -313,6 +313,17 @@ then ``extractall(pat).xs(0, level='match')`` gives the same result as
extractall_result
extractall_result.xs(0, level="match")
+``Index`` also supports ``.str.extractall``. It returns a ``DataFrame`` which has the
+same result as a ``Series.str.extractall`` with a default index (starts from 0).
+
+.. versionadded:: 0.18.2
+
+.. ipython:: python
+
+ pd.Index(["a1a2", "b1", "c1"]).str.extractall(two_groups)
+
+ pd.Series(["a1a2", "b1", "c1"]).str.extractall(two_groups)
+
Testing for Strings that Match or Contain a Pattern
---------------------------------------------------
diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt
index bae8b1358826b..b86a7a81625e2 100644
--- a/doc/source/whatsnew/v0.18.2.txt
+++ b/doc/source/whatsnew/v0.18.2.txt
@@ -31,7 +31,12 @@ Other enhancements
- The ``.tz_localize()`` method of ``DatetimeIndex`` and ``Timestamp`` has gained the ``errors`` keyword, so you can potentially coerce nonexistent timestamps to ``NaT``. The default behaviour remains to raise a ``NonExistentTimeError`` (:issue:`13057`)
+- ``Index`` now supports ``.str.extractall()`` which returns ``DataFrame``, see :ref:`Extract all matches in each subject (extractall) <text.extractall>` (:issue:`10008`, :issue:`13156`)
+
+ .. ipython:: python
+
+ idx = pd.Index(["a1a2", "b1", "c1"])
+ idx.str.extractall("[ab](?P<digit>\d)")
.. _whatsnew_0182.api:
@@ -120,6 +125,7 @@ Bug Fixes
+- Bug in ``Series.str.extractall()`` with ``str`` index raises ``ValueError`` (:issue:`13156`)
- Bug in ``PeriodIndex`` and ``Period`` subtraction raises ``AttributeError`` (:issue:`13071`)
diff --git a/pandas/core/strings.py b/pandas/core/strings.py
index 524c0205d7f73..5b1b8bd05af42 100644
--- a/pandas/core/strings.py
+++ b/pandas/core/strings.py
@@ -8,6 +8,7 @@ from pandas.core.algorithms import take_1d
import pandas.compat as compat
from pandas.core.base import AccessorProperty, NoNewAttributesMixin
+from pandas.types import api as gt
from pandas.util.decorators import Appender, deprecate_kwarg
import re
import pandas.lib as lib
@@ -148,12 +149,10 @@ def _na_map(f, arr, na_result=np.nan, dtype=object):
def _map(f, arr, na_mask=False, na_value=np.nan, dtype=object):
- from pandas.core.series import Series
-
if not len(arr):
return np.ndarray(0, dtype=dtype)
- if isinstance(arr, Series):
+ if isinstance(arr, gt.ABCSeries):
arr = arr.values
if not isinstance(arr, np.ndarray):
arr = np.asarray(arr, dtype=object)
@@ -687,33 +686,42 @@ def str_extractall(arr, pat, flags=0):
C 0 NaN 1
"""
- from pandas import DataFrame, MultiIndex
+
regex = re.compile(pat, flags=flags)
# the regex must contain capture groups.
if regex.groups == 0:
raise ValueError("pattern contains no capture groups")
+
+ if isinstance(arr, gt.ABCIndex):
+ arr = arr.to_series().reset_index(drop=True)
+
names = dict(zip(regex.groupindex.values(), regex.groupindex.keys()))
columns = [names.get(1 + i, i) for i in range(regex.groups)]
match_list = []
index_list = []
+ is_mi = arr.index.nlevels > 1
+
for subject_key, subject in arr.iteritems():
if isinstance(subject, compat.string_types):
- try:
- key_list = list(subject_key)
- except TypeError:
- key_list = [subject_key]
+
+ if not is_mi:
+ subject_key = (subject_key, )
+
for match_i, match_tuple in enumerate(regex.findall(subject)):
- na_tuple = [
- np.NaN if group == "" else group for group in match_tuple]
+ na_tuple = [np.NaN if group == "" else group
+ for group in match_tuple]
match_list.append(na_tuple)
- result_key = tuple(key_list + [match_i])
+ result_key = tuple(subject_key + (match_i, ))
index_list.append(result_key)
+
if 0 < len(index_list):
+ from pandas import MultiIndex
index = MultiIndex.from_tuples(
index_list, names=arr.index.names + ["match"])
else:
index = None
- result = DataFrame(match_list, index, columns)
+ result = arr._constructor_expanddim(match_list, index=index,
+ columns=columns)
return result
@@ -1804,9 +1812,9 @@ class StringAccessorMixin(object):
# string methods
def _make_str_accessor(self):
- from pandas.core.series import Series
from pandas.core.index import Index
- if (isinstance(self, Series) and
+
+ if (isinstance(self, gt.ABCSeries) and
not ((is_categorical_dtype(self.dtype) and
is_object_dtype(self.values.categories)) or
(is_object_dtype(self.dtype)))):
@@ -1819,6 +1827,8 @@ def _make_str_accessor(self):
"values, which use np.object_ dtype in "
"pandas")
elif isinstance(self, Index):
+ # can't use ABCIndex to exclude non-str
+ # see src/inference.pyx which can contain string values
allowed_types = ('string', 'unicode', 'mixed',
'mixed-integer')
if self.inferred_type not in allowed_types:
diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py
index 4179949bc49a6..05525acedc245 100644
--- a/pandas/tests/test_strings.py
+++ b/pandas/tests/test_strings.py
@@ -982,6 +982,30 @@ def test_extractall_no_matches(self):
"second"])
tm.assert_frame_equal(r, e)
+ def test_extractall_stringindex(self):
+ s = Series(["a1a2", "b1", "c1"], name='xxx')
+ res = s.str.extractall("[ab](?P<digit>\d)")
+ exp_idx = MultiIndex.from_tuples([(0, 0), (0, 1), (1, 0)],
+ names=[None, 'match'])
+ exp = DataFrame({'digit': ["1", "2", "1"]}, index=exp_idx)
+ tm.assert_frame_equal(res, exp)
+
+ # index should return the same result as the default index without name
+ # thus index.name doesn't affect the result
+ for idx in [Index(["a1a2", "b1", "c1"]),
+ Index(["a1a2", "b1", "c1"], name='xxx')]:
+
+ res = idx.str.extractall("[ab](?P<digit>\d)")
+ tm.assert_frame_equal(res, exp)
+
+ s = Series(["a1a2", "b1", "c1"], name='s_name',
+ index=Index(["XX", "yy", "zz"], name='idx_name'))
+ res = s.str.extractall("[ab](?P<digit>\d)")
+ exp_idx = MultiIndex.from_tuples([("XX", 0), ("XX", 1), ("yy", 0)],
+ names=["idx_name", 'match'])
+ exp = DataFrame({'digit': ["1", "2", "1"]}, index=exp_idx)
+ tm.assert_frame_equal(res, exp)
+
def test_extractall_errors(self):
# Does not make sense to use extractall with a regex that has
# no capture groups. (it returns DataFrame with one column for
@@ -991,8 +1015,8 @@ def test_extractall_errors(self):
s.str.extractall(r'[a-z]')
def test_extract_index_one_two_groups(self):
- s = Series(
- ['a3', 'b3', 'd4c2'], ["A3", "B3", "D4"], name='series_name')
+ s = Series(['a3', 'b3', 'd4c2'], index=["A3", "B3", "D4"],
+ name='series_name')
r = s.index.str.extract(r'([A-Z])', expand=True)
e = DataFrame(['A', "B", "D"])
tm.assert_frame_equal(r, e)
From 01dd11109a0d1def8bc3b03d06c533817cc273f2 Mon Sep 17 00:00:00 2001
From: Sanjiv Lobo
Date: Fri, 13 May 2016 19:12:43 -0400
Subject: [PATCH 13/96] DOC: Fix additional join examples in "10 Minutes to pandas" #13029

- [x] closes #13029

Author: Sanjiv Lobo

Closes #13171 from Xndr7/fix-additional-join-examples-in-"10-Minutes-to-pandas"-#13029 and squashes the following commits:

633c7ff [Sanjiv Lobo] fixed docs for issue #13029
---
doc/source/10min.rst | 11 +++++++++++
1 file changed, 11 insertions(+)
diff --git a/doc/source/10min.rst b/doc/source/10min.rst
index d51290b2a983b..54bcd76855f32 100644
--- a/doc/source/10min.rst
+++ b/doc/source/10min.rst
@@ -483,6 +483,17 @@ SQL style merges. See the :ref:`Database style joining <merging.join>`
right
pd.merge(left, right, on='key')
+Another example:
+
+.. ipython:: python
+
+ left = pd.DataFrame({'key': ['foo', 'bar'], 'lval': [1, 2]})
+ right = pd.DataFrame({'key': ['foo', 'bar'], 'rval': [4, 5]})
+ left
+ right
+ pd.merge(left, right, on='key')
+
+
Append
~~~~~~
From feee089e41cc2dd5ff88e1068a5ca5595b6ff2f6 Mon Sep 17 00:00:00 2001
From: Jeff Reback
Date: Sat, 14 May 2016 08:00:41 -0400
Subject: [PATCH 14/96] BUG: Bug in .groupby(..).resample(..)
when the same object is called multiple times --- doc/source/whatsnew/v0.18.2.txt | 2 +- pandas/tests/test_window.py | 14 ++++++++++++++ pandas/tseries/resample.py | 3 ++- pandas/tseries/tests/test_resample.py | 19 +++++++++++++++++++ 4 files changed, 36 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index b86a7a81625e2..e92cb8cef4432 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -118,7 +118,7 @@ Bug Fixes - Bug when passing a not-default-indexed ``Series`` as ``xerr`` or ``yerr`` in ``.plot()`` (:issue:`11858`) - +- Bug in ``.groupby(..).resample(..)`` when the same object is called multiple times (:issue:`13174`) - Regression in ``Series.quantile`` with nans (also shows up in ``.median()`` and ``.describe()``); furthermore now names the ``Series`` with the quantile (:issue:`13098`, :issue:`13146`) diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py index 22ac583a3b808..a043e92bd2c76 100644 --- a/pandas/tests/test_window.py +++ b/pandas/tests/test_window.py @@ -2835,6 +2835,20 @@ def test_getitem(self): result = self.frame.B.groupby(self.frame.A).rolling(2).mean() assert_series_equal(result, expected) + def test_getitem_multiple(self): + + # GH 13174 + g = self.frame.groupby('A') + r = g.rolling(2) + g_mutated = self.frame.groupby('A', mutated=True) + expected = g_mutated.B.apply(lambda x: x.rolling(2).count()) + + result = r.B.count() + assert_series_equal(result, expected) + + result = r.B.count() + assert_series_equal(result, expected) + def test_rolling(self): g = self.frame.groupby('A') r = g.rolling(window=4) diff --git a/pandas/tseries/resample.py b/pandas/tseries/resample.py index a0f08a93a07d9..bb7915e978c3e 100644 --- a/pandas/tseries/resample.py +++ b/pandas/tseries/resample.py @@ -1,6 +1,7 @@ from datetime import timedelta import numpy as np import warnings +import copy import pandas as pd from pandas.core.base import AbstractMethodError, GroupByMixin @@ -592,7 +593,7 @@ def __init__(self, obj, *args, **kwargs): self._groupby = groupby self._groupby.mutated = True self._groupby.grouper.mutated = True - self.groupby = parent.groupby + self.groupby = copy.copy(parent.groupby) def _apply(self, f, **kwargs): """ diff --git a/pandas/tseries/tests/test_resample.py b/pandas/tseries/tests/test_resample.py index 77396c3e38c93..5dd2368db2cb8 100644 --- a/pandas/tseries/tests/test_resample.py +++ b/pandas/tseries/tests/test_resample.py @@ -2519,6 +2519,25 @@ def test_getitem(self): result = g.resample('2s').mean().B assert_series_equal(result, expected) + def test_getitem_multiple(self): + + # GH 13174 + # multiple calls after selection causing an issue with aliasing + data = [{'id': 1, 'buyer': 'A'}, {'id': 2, 'buyer': 'B'}] + df = pd.DataFrame(data, index=pd.date_range('2016-01-01', periods=2)) + r = df.groupby('id').resample('1D') + result = r['buyer'].count() + expected = pd.Series([1, 1], + index=pd.MultiIndex.from_tuples( + [(1, pd.Timestamp('2016-01-01')), + (2, pd.Timestamp('2016-01-02'))], + names=['id', None]), + name='buyer') + assert_series_equal(result, expected) + + result = r['buyer'].count() + assert_series_equal(result, expected) + def test_methods(self): g = self.frame.groupby('A') r = g.resample('2s') From b38579999f7385cf3be59d6be7f3bb40990d12d1 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Sat, 14 May 2016 08:01:54 -0400 Subject: [PATCH 15/96] DOC: Clarify Categorical Crosstab Behaviour Follow-on to #13073 by explaining the `Categorical` behaviour 
in the documentation. Author: gfyoung Closes #13177 from gfyoung/crosstab-categorical-explain and squashes the following commits: 11ebb94 [gfyoung] DOC: Clarify Categorical Crosstab Behaviour --- doc/source/reshaping.rst | 10 ++++++++++ pandas/tools/pivot.py | 16 +++++++++++++++- 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/doc/source/reshaping.rst b/doc/source/reshaping.rst index 21765b3f621ce..9ed2c42610b69 100644 --- a/doc/source/reshaping.rst +++ b/doc/source/reshaping.rst @@ -445,6 +445,16 @@ If ``crosstab`` receives only two Series, it will provide a frequency table. pd.crosstab(df.A, df.B) +Any input passed containing ``Categorical`` data will have **all** of its +categories included in the cross-tabulation, even if the actual data does +not contain any instances of a particular category. + +.. ipython:: python + + foo = pd.Categorical(['a', 'b'], categories=['a', 'b', 'c']) + bar = pd.Categorical(['d', 'e'], categories=['d', 'e', 'f']) + pd.crosstab(foo, bar) + Normalization ~~~~~~~~~~~~~ diff --git a/pandas/tools/pivot.py b/pandas/tools/pivot.py index de79e54e22270..a4e6cc404a457 100644 --- a/pandas/tools/pivot.py +++ b/pandas/tools/pivot.py @@ -410,7 +410,11 @@ def crosstab(index, columns, values=None, rownames=None, colnames=None, Notes ----- Any Series passed will have their name attributes used unless row or column - names for the cross-tabulation are specified + names for the cross-tabulation are specified. + + Any input passed containing Categorical data will have **all** of its + categories included in the cross-tabulation, even if the actual data does + not contain any instances of a particular category. In the event that there aren't overlapping indexes an empty DataFrame will be returned. @@ -434,6 +438,16 @@ def crosstab(index, columns, values=None, rownames=None, colnames=None, bar 1 2 1 0 foo 2 2 1 2 + >>> foo = pd.Categorical(['a', 'b'], categories=['a', 'b', 'c']) + >>> bar = pd.Categorical(['d', 'e'], categories=['d', 'e', 'f']) + >>> crosstab(foo, bar) # 'c' and 'f' are not represented in the data, + # but they still will be counted in the output + col_0 d e f + row_0 + a 1 0 0 + b 0 1 0 + c 0 0 0 + Returns ------- crosstab : DataFrame From 2de2884a7e7abf64f9967f6d8bc05a2d45f59bb4 Mon Sep 17 00:00:00 2001 From: Ka Wo Chen Date: Sat, 14 May 2016 08:02:48 -0400 Subject: [PATCH 16/96] BUG: GH12896 where extra elements are returned in MultiIndex slicing closes #12896 Author: Ka Wo Chen Closes #13117 from kawochen/BUG-FIX-12896 and squashes the following commits: 7d49346 [Ka Wo Chen] BUG: GH12896 where extra elements are returned in MultiIndex slicing --- doc/source/whatsnew/v0.18.2.txt | 2 +- pandas/indexes/multi.py | 3 ++- pandas/tests/indexing/test_indexing.py | 12 ++++++++++++ 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index e92cb8cef4432..3ac466158276f 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -134,7 +134,7 @@ Bug Fixes - +- Bug in ``MultiIndex`` slicing where extra elements were returned when level is non-unique (:issue:`12896`) diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py index db2f80ae78446..6f3360cdf82a7 100644 --- a/pandas/indexes/multi.py +++ b/pandas/indexes/multi.py @@ -1761,7 +1761,8 @@ def convert_indexer(start, stop, step, indexer=indexer, labels=labels): else: m = np.zeros(len(labels), dtype=bool) - m[np.in1d(labels, r, assume_unique=True)] = True + m[np.in1d(labels, r, + 
assume_unique=Index(labels).is_unique)] = True return m diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index 4b8b5ae2571d0..fdc9d3599e8ac 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -2334,6 +2334,18 @@ def test_multiindex_slicers_non_unique(self): self.assertFalse(result.index.is_unique) assert_frame_equal(result, expected) + # GH12896 + # numpy-implementation dependent bug + ints = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 12, 13, 14, 14, 16, + 17, 18, 19, 200000, 200000] + n = len(ints) + idx = MultiIndex.from_arrays([['a'] * n, ints]) + result = Series([1] * n, index=idx) + result = result.sort_index() + result = result.loc[(slice(None), slice(100000))] + expected = Series([1] * (n - 2), index=idx[:-2]).sort_index() + assert_series_equal(result, expected) + def test_multiindex_slicers_datetimelike(self): # GH 7429 From f637aa31cf8e99796b383324aff474a87d19d222 Mon Sep 17 00:00:00 2001 From: "John W. O'Brien" Date: Sun, 15 May 2016 14:29:01 -0400 Subject: [PATCH 17/96] TST: Use compatible time zones xref #13186 closes #13190 --- pandas/tests/series/test_timeseries.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/tests/series/test_timeseries.py b/pandas/tests/series/test_timeseries.py index ee06bc2c3dd4e..de62fb4ab6f07 100644 --- a/pandas/tests/series/test_timeseries.py +++ b/pandas/tests/series/test_timeseries.py @@ -346,8 +346,10 @@ def test_getitem_setitem_datetime_tz_dateutil(self): from pandas import date_range N = 50 + # testing with timezone, GH #2785 - rng = date_range('1/1/1990', periods=N, freq='H', tz='US/Eastern') + rng = date_range('1/1/1990', periods=N, freq='H', + tz='America/New_York') ts = Series(np.random.randn(N), index=rng) # also test Timestamp tz handling, GH #2789 @@ -368,8 +370,8 @@ def test_getitem_setitem_datetime_tz_dateutil(self): assert_series_equal(result, ts) result = ts.copy() - result[datetime(1990, 1, 1, 3, tzinfo=tz('US/Central'))] = 0 - result[datetime(1990, 1, 1, 3, tzinfo=tz('US/Central'))] = ts[4] + result[datetime(1990, 1, 1, 3, tzinfo=tz('America/Chicago'))] = 0 + result[datetime(1990, 1, 1, 3, tzinfo=tz('America/Chicago'))] = ts[4] assert_series_equal(result, ts) def test_getitem_setitem_periodindex(self): From 62bed0e33397132bd4340c8da54c3feeb22e5083 Mon Sep 17 00:00:00 2001 From: John Evans Date: Mon, 16 May 2016 08:06:51 -0400 Subject: [PATCH 18/96] COMPAT: Add Pathlib, py.path support for read_hdf Closes #11773 Author: John Evans Closes #12930 from quintusdias/issue11773 and squashes the following commits: dcee282 [John Evans] COMPAT: Add Pathlib, py.path support for read_hdf, to_hdf --- doc/source/whatsnew/v0.18.2.txt | 1 + pandas/io/pytables.py | 11 +++++++++- pandas/io/tests/test_pytables.py | 36 ++++++++++++++++++++++++++++++++ 3 files changed, 47 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index 3ac466158276f..459bdbf10a4f1 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -32,6 +32,7 @@ Other enhancements - The ``.tz_localize()`` method of ``DatetimeIndex`` and ``Timestamp`` has gained the ``errors`` keyword, so you can potentially coerce nonexistent timestamps to ``NaT``. 
The default behaviour remains to raise a ``NonExistentTimeError`` (:issue:`13057`)
- ``Index`` now supports ``.str.extractall()`` which returns ``DataFrame``, see :ref:`Extract all matches in each subject (extractall) <text.extractall>` (:issue:`10008`, :issue:`13156`)
+- ``.to_hdf/read_hdf()`` now accept path objects (e.g. ``pathlib.Path``, ``py.path.local``) for the file path (:issue:`11773`)
.. ipython:: python
diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
index 318fd17b8f88e..d350358081aa7 100644
--- a/pandas/io/pytables.py
+++ b/pandas/io/pytables.py
@@ -13,10 +13,12 @@ import os
import numpy as np
+
import pandas as pd
from pandas import (Series, DataFrame, Panel, Panel4D, Index,
MultiIndex, Int64Index)
from pandas.core import config
+from pandas.io.common import _stringify_path
from pandas.sparse.api import SparseSeries, SparseDataFrame, SparsePanel
from pandas.sparse.array import BlockIndex, IntIndex
from pandas.tseries.api import PeriodIndex, DatetimeIndex
@@ -254,6 +256,7 @@ def to_hdf(path_or_buf, key, value, mode=None, complevel=None, complib=None,
else:
f = lambda store: store.put(key, value, **kwargs)
+ path_or_buf = _stringify_path(path_or_buf)
if isinstance(path_or_buf, string_types):
with HDFStore(path_or_buf, mode=mode, complevel=complevel,
complib=complib) as store:
@@ -270,7 +273,11 @@ def read_hdf(path_or_buf, key=None, **kwargs):
Parameters
----------
- path_or_buf : path (string), or buffer to read from
+ path_or_buf : path (string), buffer, or path object (pathlib.Path or
+ py._path.local.LocalPath) to read from
+
+ .. versionadded:: 0.18.2 support for pathlib, py.path.
+
key : group identifier in the store. Can be omitted if the HDF file
contains a single pandas object.
where : list of Term (or convertible) objects, optional
@@ -293,6 +300,7 @@ def read_hdf(path_or_buf, key=None, **kwargs):
if 'where' in kwargs:
kwargs['where'] = _ensure_term(kwargs['where'], scope_level=1)
+ path_or_buf = _stringify_path(path_or_buf)
if isinstance(path_or_buf, string_types):
try:
@@ -316,6 +324,7 @@ def read_hdf(path_or_buf, key=None, **kwargs):
store = path_or_buf
auto_close = False
+
else:
raise NotImplementedError('Support for generic buffers has not been '
'implemented.')
diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py
index d21189fe91a2a..6bf0175526424 100644
--- a/pandas/io/tests/test_pytables.py
+++ b/pandas/io/tests/test_pytables.py
@@ -4836,6 +4836,42 @@ def test_read_nokey(self):
df.to_hdf(path, 'df2', mode='a')
self.assertRaises(ValueError, read_hdf, path)
+ def test_read_from_pathlib_path(self):
+
+ # GH11773
+ tm._skip_if_no_pathlib()
+
+ from pathlib import Path
+
+ expected = DataFrame(np.random.rand(4, 5),
+ index=list('abcd'),
+ columns=list('ABCDE'))
+ with ensure_clean_path(self.path) as filename:
+ path_obj = Path(filename)
+
+ expected.to_hdf(path_obj, 'df', mode='a')
+ actual = read_hdf(path_obj, 'df')
+
+ tm.assert_frame_equal(expected, actual)
+
+ def test_read_from_py_localpath(self):
+
+ # GH11773
+ tm._skip_if_no_localpath()
+
+ from py.path import local as LocalPath
+
+ expected = DataFrame(np.random.rand(4, 5),
+ index=list('abcd'),
+ columns=list('ABCDE'))
+ with ensure_clean_path(self.path) as filename:
+ path_obj = LocalPath(filename)
+
+ expected.to_hdf(path_obj, 'df', mode='a')
+ actual = read_hdf(path_obj, 'df')
+
+ tm.assert_frame_equal(expected, actual)
+
class TestHDFComplexValues(Base):
# GH10447
From 4e4a7d9ae5b3c05b21dc0a0f3c12648236ec586e Mon Sep 17 00:00:00 2001
From: Jeff Reback
Date: Mon, 16 May 2016 08:25:53
-0400 Subject: [PATCH 19/96] COMPAT/TST: sparse formatting test for platform, xref #13163 --- pandas/sparse/tests/test_format.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/pandas/sparse/tests/test_format.py b/pandas/sparse/tests/test_format.py index 2981e0f4af0bf..9bdc1fdd101ea 100644 --- a/pandas/sparse/tests/test_format.py +++ b/pandas/sparse/tests/test_format.py @@ -17,14 +17,18 @@ class TestSeriesFormatting(tm.TestCase): _multiprocess_can_split_ = True + @property + def dtype_format_for_platform(self): + return '' if use_32bit_repr else ', dtype=int32' + def test_sparse_max_row(self): s = pd.Series([1, np.nan, np.nan, 3, np.nan]).to_sparse() result = repr(s) - dtype = '' if use_32bit_repr else ', dtype=int32' + dfm = self.dtype_format_for_platform exp = ("0 1.0\n1 NaN\n2 NaN\n3 3.0\n" "4 NaN\ndtype: float64\nBlockIndex\n" "Block locations: array([0, 3]{0})\n" - "Block lengths: array([1, 1]{0})".format(dtype)) + "Block lengths: array([1, 1]{0})".format(dfm)) self.assertEqual(result, exp) with option_context("display.max_rows", 3): @@ -33,7 +37,7 @@ def test_sparse_max_row(self): exp = ("0 1.0\n ... \n4 NaN\n" "dtype: float64\nBlockIndex\n" "Block locations: array([0, 3]{0})\n" - "Block lengths: array([1, 1]{0})".format(dtype)) + "Block lengths: array([1, 1]{0})".format(dfm)) self.assertEqual(result, exp) def test_sparse_mi_max_row(self): @@ -42,12 +46,12 @@ def test_sparse_mi_max_row(self): s = pd.Series([1, np.nan, np.nan, 3, np.nan, np.nan], index=idx).to_sparse() result = repr(s) - dtype = '' if use_32bit_repr else ', dtype=int32' + dfm = self.dtype_format_for_platform exp = ("A 0 1.0\n 1 NaN\nB 0 NaN\n" "C 0 3.0\n 1 NaN\n 2 NaN\n" "dtype: float64\nBlockIndex\n" - "Block locations: array([0, 3], dtype=int32)\n" - "Block lengths: array([1, 1]{0})".format(dtype)) + "Block locations: array([0, 3]{0})\n" + "Block lengths: array([1, 1]{0})".format(dfm)) self.assertEqual(result, exp) with option_context("display.max_rows", 3): @@ -55,6 +59,6 @@ def test_sparse_mi_max_row(self): result = repr(s) exp = ("A 0 1.0\n ... 
\nC 2 NaN\n" "dtype: float64\nBlockIndex\n" - "Block locations: array([0, 3], dtype=int32)\n" - "Block lengths: array([1, 1]{0})".format(dtype)) + "Block locations: array([0, 3]{0})\n" + "Block lengths: array([1, 1]{0})".format(dfm)) self.assertEqual(result, exp) From 62fc4818e2bb5a13cddf1929950975913af27bb5 Mon Sep 17 00:00:00 2001 From: Maximilian Roos Date: Tue, 17 May 2016 09:22:06 -0400 Subject: [PATCH 20/96] CLN: no return on init Author: Maximilian Roos Closes #13197 from MaximilianR/init-return and squashes the following commits: ee5072a [Maximilian Roos] formatting --- pandas/core/window.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/core/window.py b/pandas/core/window.py index b1be66bee9bc8..eb0d996436661 100644 --- a/pandas/core/window.py +++ b/pandas/core/window.py @@ -985,10 +985,8 @@ class Expanding(_Rolling_and_Expanding): def __init__(self, obj, min_periods=1, freq=None, center=False, axis=0, **kwargs): - return super(Expanding, self).__init__(obj=obj, - min_periods=min_periods, - freq=freq, center=center, - axis=axis) + super(Expanding, self).__init__(obj=obj, min_periods=min_periods, + freq=freq, center=center, axis=axis) @property def _constructor(self): From 20ea4064b0c94f99c275bfc4217664cc8aea75c5 Mon Sep 17 00:00:00 2001 From: zhangjinjie Date: Tue, 17 May 2016 09:39:37 -0400 Subject: [PATCH 21/96] BUG: fix to_records confict with unicode_literals #13172 closes #13172 Author: zhangjinjie Closes #13178 from starplanet/bugfix-13172 and squashes the following commits: 62b00d3 [zhangjinjie] BUG: fix to_records confict with unicode_literals #13172 --- doc/source/whatsnew/v0.18.2.txt | 1 + pandas/core/frame.py | 2 +- pandas/tests/frame/test_convert_to.py | 8 ++++++++ 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index 459bdbf10a4f1..6db628417f38e 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -120,6 +120,7 @@ Bug Fixes - Bug in ``.groupby(..).resample(..)`` when the same object is called multiple times (:issue:`13174`) +- Bug in ``.to_records()`` when index name is a unicode string (:issue: `13172`) - Regression in ``Series.quantile`` with nans (also shows up in ``.median()`` and ``.describe()``); furthermore now names the ``Series`` with the quantile (:issue:`13098`, :issue:`13146`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 3bf442349ef04..b3d01d12c9336 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1062,7 +1062,7 @@ def to_records(self, index=True, convert_datetime64=True): count += 1 elif index_names[0] is None: index_names = ['index'] - names = index_names + lmap(str, self.columns) + names = lmap(str, index_names) + lmap(str, self.columns) else: arrays = [self[c].get_values() for c in self.columns] names = lmap(str, self.columns) diff --git a/pandas/tests/frame/test_convert_to.py b/pandas/tests/frame/test_convert_to.py index 8bb253e17fd06..4e65ee09746b8 100644 --- a/pandas/tests/frame/test_convert_to.py +++ b/pandas/tests/frame/test_convert_to.py @@ -172,3 +172,11 @@ def test_to_records_index_name(self): df.index.names = ['A', None] rs = df.to_records() self.assertIn('level_0', rs.dtype.fields) + + def test_to_records_with_unicode_index(self): + # GH13172 + # unicode_literals conflict with to_records + result = DataFrame([{u'a': u'x', u'b': 'y'}]).set_index(u'a')\ + .to_records() + expected = np.rec.array([('x', 'y')], dtype=[('a', 'O'), ('b', 'O')]) + 
tm.assert_numpy_array_equal(result, expected) From 00e0f3eb039d2ee8f87a7cdb3c33440e7638de80 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Tue, 17 May 2016 13:20:17 -0400 Subject: [PATCH 22/96] BUG: Period and Series/Index comparison raises TypeError Author: sinhrks Closes #13200 from sinhrks/period_comp and squashes the following commits: aadf669 [sinhrks] BUG: Period and Series/Index comparison raises TypeError --- doc/source/whatsnew/v0.18.2.txt | 1 + pandas/src/period.pyx | 3 + pandas/tseries/tests/test_period.py | 131 ++++++++++++++++++++++++++-- 3 files changed, 129 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index 6db628417f38e..5b72afe53e30e 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -149,4 +149,5 @@ Bug Fixes - Bug in ``NaT`` - ``Period`` raises ``AttributeError`` (:issue:`13071`) - Bug in ``Period`` addition raises ``TypeError`` if ``Period`` is on right hand side (:issue:`13069`) +- Bug in ``Period`` and ``Series`` or ``Index`` comparison raises ``TypeError`` (:issue:`13200`) - Bug in ``pd.set_eng_float_format()`` that would prevent NaN's from formatting (:issue:`11981`) diff --git a/pandas/src/period.pyx b/pandas/src/period.pyx index 0cb0b575b25dc..670fe1e4f168c 100644 --- a/pandas/src/period.pyx +++ b/pandas/src/period.pyx @@ -772,6 +772,9 @@ cdef class Period(object): if self.ordinal == tslib.iNaT or other.ordinal == tslib.iNaT: return _nat_scalar_rules[op] return PyObject_RichCompareBool(self.ordinal, other.ordinal, op) + # index/series like + elif hasattr(other, '_typ'): + return NotImplemented else: if op == Py_EQ: return NotImplemented diff --git a/pandas/tseries/tests/test_period.py b/pandas/tseries/tests/test_period.py index 4217cc9a299a3..db1572a49a9ff 100644 --- a/pandas/tseries/tests/test_period.py +++ b/pandas/tseries/tests/test_period.py @@ -3950,24 +3950,30 @@ def test_pi_pi_comp(self): exp = np.array([False, True, False, False]) self.assert_numpy_array_equal(base == p, exp) + self.assert_numpy_array_equal(p == base, exp) exp = np.array([True, False, True, True]) self.assert_numpy_array_equal(base != p, exp) + self.assert_numpy_array_equal(p != base, exp) exp = np.array([False, False, True, True]) self.assert_numpy_array_equal(base > p, exp) + self.assert_numpy_array_equal(p < base, exp) exp = np.array([True, False, False, False]) self.assert_numpy_array_equal(base < p, exp) + self.assert_numpy_array_equal(p > base, exp) exp = np.array([False, True, True, True]) self.assert_numpy_array_equal(base >= p, exp) + self.assert_numpy_array_equal(p <= base, exp) exp = np.array([True, True, False, False]) self.assert_numpy_array_equal(base <= p, exp) + self.assert_numpy_array_equal(p >= base, exp) - idx = PeriodIndex( ['2011-02', '2011-01', '2011-03', '2011-05'], freq=freq) + idx = PeriodIndex(['2011-02', '2011-01', '2011-03', + '2011-05'], freq=freq) exp = np.array([False, False, True, False]) self.assert_numpy_array_equal(base == idx, exp) @@ -3992,6 +3998,9 @@ def test_pi_pi_comp(self): with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg): base <= Period('2011', freq='A') + with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg): + Period('2011', freq='A') >= base + with tm.assertRaisesRegexp(ValueError, msg): idx = PeriodIndex(['2011', '2012', '2013', '2014'], freq='A') base <= idx @@ -4001,6 +4010,9 @@ def test_pi_pi_comp(self): with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg): base <= Period('2011', freq='4M') + with 
tm.assertRaisesRegexp(period.IncompatibleFrequency, msg): + Period('2011', freq='4M') >= base + with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg): idx = PeriodIndex(['2011', '2012', '2013', '2014'], freq='4M') base <= idx @@ -4013,17 +4025,23 @@ def test_pi_nat_comp(self): result = idx1 > Period('2011-02', freq=freq) exp = np.array([False, False, False, True]) self.assert_numpy_array_equal(result, exp) + result = Period('2011-02', freq=freq) < idx1 + self.assert_numpy_array_equal(result, exp) result = idx1 == Period('NaT', freq=freq) exp = np.array([False, False, False, False]) self.assert_numpy_array_equal(result, exp) + result = Period('NaT', freq=freq) == idx1 + self.assert_numpy_array_equal(result, exp) result = idx1 != Period('NaT', freq=freq) exp = np.array([True, True, True, True]) self.assert_numpy_array_equal(result, exp) + result = Period('NaT', freq=freq) != idx1 + self.assert_numpy_array_equal(result, exp) - idx2 = PeriodIndex( - ['2011-02', '2011-01', '2011-04', 'NaT'], freq=freq) + idx2 = PeriodIndex(['2011-02', '2011-01', '2011-04', + 'NaT'], freq=freq) result = idx1 < idx2 exp = np.array([True, False, False, False]) self.assert_numpy_array_equal(result, exp) @@ -4044,11 +4062,12 @@ def test_pi_nat_comp(self): exp = np.array([False, False, True, False]) self.assert_numpy_array_equal(result, exp) - diff = PeriodIndex( - ['2011-02', '2011-01', '2011-04', 'NaT'], freq='4M') + diff = PeriodIndex(['2011-02', '2011-01', '2011-04', + 'NaT'], freq='4M') msg = "Input has different freq=4M from PeriodIndex" with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg): idx1 > diff + with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg): idx1 == diff @@ -4185,6 +4204,106 @@ def test_ops_series_period(self): tm.assert_series_equal(s2 - s, exp) tm.assert_series_equal(s - s2, -exp) + def test_comp_series_period_scalar(self): + # GH 13200 + for freq in ['M', '2M', '3M']: + base = Series([Period(x, freq=freq) for x in + ['2011-01', '2011-02', '2011-03', '2011-04']]) + p = Period('2011-02', freq=freq) + + exp = pd.Series([False, True, False, False]) + tm.assert_series_equal(base == p, exp) + tm.assert_series_equal(p == base, exp) + + exp = pd.Series([True, False, True, True]) + tm.assert_series_equal(base != p, exp) + tm.assert_series_equal(p != base, exp) + + exp = pd.Series([False, False, True, True]) + tm.assert_series_equal(base > p, exp) + tm.assert_series_equal(p < base, exp) + + exp = pd.Series([True, False, False, False]) + tm.assert_series_equal(base < p, exp) + tm.assert_series_equal(p > base, exp) + + exp = pd.Series([False, True, True, True]) + tm.assert_series_equal(base >= p, exp) + tm.assert_series_equal(p <= base, exp) + + exp = pd.Series([True, True, False, False]) + tm.assert_series_equal(base <= p, exp) + tm.assert_series_equal(p >= base, exp) + + # different base freq + msg = "Input has different freq=A-DEC from Period" + with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg): + base <= Period('2011', freq='A') + + with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg): + Period('2011', freq='A') >= base + + def test_comp_series_period_series(self): + # GH 13200 + for freq in ['M', '2M', '3M']: + base = Series([Period(x, freq=freq) for x in + ['2011-01', '2011-02', '2011-03', '2011-04']]) + + s = Series([Period(x, freq=freq) for x in + ['2011-02', '2011-01', '2011-03', '2011-05']]) + + exp = Series([False, False, True, False]) + tm.assert_series_equal(base == s, exp) + + exp = Series([True, True, False, True]) + 
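# Note on the reflected cases exercised above (a sketch, not test
# code): Period.__richcmp__ now returns NotImplemented for
# index/series-like operands (the hasattr(other, '_typ') branch in
# period.pyx), so Python retries the swapped comparison on the
# Series/PeriodIndex side instead of raising TypeError:
#
#     p = Period('2011-02', freq='M')
#     s = Series([Period('2011-01', freq='M'),
#                 Period('2011-03', freq='M')])
#     p < s  # evaluated as s > p, element-wise -> [False, True]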
tm.assert_series_equal(base != s, exp) + + exp = Series([False, True, False, False]) + tm.assert_series_equal(base > s, exp) + + exp = Series([True, False, False, True]) + tm.assert_series_equal(base < s, exp) + + exp = Series([False, True, True, False]) + tm.assert_series_equal(base >= s, exp) + + exp = Series([True, False, True, True]) + tm.assert_series_equal(base <= s, exp) + + s2 = Series([Period(x, freq='A') for x in + ['2011', '2011', '2011', '2011']]) + + # different base freq + msg = "Input has different freq=A-DEC from Period" + with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg): + base <= s2 + + def test_comp_series_period_object(self): + # GH 13200 + base = Series([Period('2011', freq='A'), Period('2011-02', freq='M'), + Period('2013', freq='A'), Period('2011-04', freq='M')]) + + s = Series([Period('2012', freq='A'), Period('2011-01', freq='M'), + Period('2013', freq='A'), Period('2011-05', freq='M')]) + + exp = Series([False, False, True, False]) + tm.assert_series_equal(base == s, exp) + + exp = Series([True, True, False, True]) + tm.assert_series_equal(base != s, exp) + + exp = Series([False, True, False, False]) + tm.assert_series_equal(base > s, exp) + + exp = Series([True, False, False, True]) + tm.assert_series_equal(base < s, exp) + + exp = Series([False, True, True, False]) + tm.assert_series_equal(base >= s, exp) + + exp = Series([True, False, True, True]) + tm.assert_series_equal(base <= s, exp) + def test_ops_frame_period(self): # GH 13043 df = pd.DataFrame({'A': [pd.Period('2015-01', freq='M'), From 2429ec5a3be278a2fb444eb2aae0c9b05b05cb62 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 17 May 2016 13:24:14 -0400 Subject: [PATCH 23/96] TST: change test comparison to work on older numpies, #13178 --- pandas/tests/frame/test_convert_to.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/frame/test_convert_to.py b/pandas/tests/frame/test_convert_to.py index 4e65ee09746b8..cf35372319c85 100644 --- a/pandas/tests/frame/test_convert_to.py +++ b/pandas/tests/frame/test_convert_to.py @@ -179,4 +179,4 @@ def test_to_records_with_unicode_index(self): result = DataFrame([{u'a': u'x', u'b': 'y'}]).set_index(u'a')\ .to_records() expected = np.rec.array([('x', 'y')], dtype=[('a', 'O'), ('b', 'O')]) - tm.assert_numpy_array_equal(result, expected) + tm.assert_almost_equal(result, expected) From 009d1df85ec6e6f80cace1d949bb7cdc8d35df7c Mon Sep 17 00:00:00 2001 From: Chris Date: Wed, 18 May 2016 09:17:22 -0400 Subject: [PATCH 24/96] PERF: DataFrame transform closes #12737 closes #13191 Author: Chris Closes #13192 from chris-b1/transform-perf and squashes the following commits: 0af1e55 [Chris] revert casting logic d61d4e0 [Chris] handle duplicate column case 9d78f65 [Chris] other categorical test name fix 045d0c7 [Chris] add back some casting b66a1c8 [Chris] PERF: DataFrame transform --- asv_bench/benchmarks/groupby.py | 15 ++++++++++ doc/source/whatsnew/v0.18.2.txt | 4 +-- pandas/core/groupby.py | 48 +++++++++++++++++--------------- pandas/tests/test_categorical.py | 6 ++-- pandas/tests/test_groupby.py | 40 ++++++++++++++++++++++++-- 5 files changed, 82 insertions(+), 31 deletions(-) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 7279d73eb0d97..586bd00b091fe 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -773,6 +773,21 @@ def setup(self): def time_groupby_transform_series2(self): self.df.groupby('id')['val'].transform(np.mean) + +class 
groupby_transform_dataframe(object): + # GH 12737 + goal_time = 0.2 + + def setup(self): + self.df = pd.DataFrame({'group': np.repeat(np.arange(1000), 10), + 'B': np.nan, + 'C': np.nan}) + self.df.ix[4::10, 'B':'C'] = 5 + + def time_groupby_transform_dataframe(self): + self.df.groupby('group').transform('first') + + class groupby_transform_cythonized(object): goal_time = 0.2 diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index 5b72afe53e30e..36fdbe8c66860 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -105,7 +105,7 @@ Performance Improvements - increased performance of ``DataFrame.quantile()`` as it now operates per-block (:issue:`11623`) - +- Improved performance of ``DataFrameGroupBy.transform`` (:issue:`12737`) .. _whatsnew_0182.bug_fixes: @@ -125,7 +125,7 @@ Bug Fixes - Regression in ``Series.quantile`` with nans (also shows up in ``.median()`` and ``.describe()``); furthermore now names the ``Series`` with the quantile (:issue:`13098`, :issue:`13146`) - +- Bug in ``SeriesGroupBy.transform`` with datetime values and missing groups (:issue:`13191`) - Bug in ``Series.str.extractall()`` with ``str`` index raises ``ValueError`` (:issue:`13156`) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 7a4791189726e..424859da82877 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -2776,18 +2776,11 @@ def _transform_fast(self, func): func = getattr(self, func) ids, _, ngroup = self.grouper.group_info - mask = ids != -1 - - out = func().values[ids] - if not mask.all(): - out = np.where(mask, out, np.nan) - - obs = np.zeros(ngroup, dtype='bool') - obs[ids[mask]] = True - if not obs.all(): - out = self._try_cast(out, self._selected_obj) - - return Series(out, index=self.obj.index) + cast = (self.size().fillna(0) > 0).any() + out = algos.take_1d(func().values, ids) + if cast: + out = self._try_cast(out, self.obj) + return Series(out, index=self.obj.index, name=self.obj.name) def filter(self, func, dropna=True, *args, **kwargs): # noqa """ @@ -3465,19 +3458,28 @@ def transform(self, func, *args, **kwargs): if not result.columns.equals(obj.columns): return self._transform_general(func, *args, **kwargs) - results = np.empty_like(obj.values, result.values.dtype) - for (name, group), (i, row) in zip(self, result.iterrows()): - indexer = self._get_index(name) - if len(indexer) > 0: - results[indexer] = np.tile(row.values, len( - indexer)).reshape(len(indexer), -1) + return self._transform_fast(result, obj) - counts = self.size().fillna(0).values - if any(counts == 0): - results = self._try_cast(results, obj[result.columns]) + def _transform_fast(self, result, obj): + """ + Fast transform path for aggregations + """ + # if there were groups with no observations (Categorical only?) 
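# In sketch form, the fast path added below works like a vectorized
# broadcast: the grouper assigns every original row an integer id
# pointing at its group's position in the aggregated result (-1 for
# rows belonging to no group), so a single take per column maps the
# aggregate back to frame shape and replaces the per-group Python
# loop. With hypothetical values:
#
#     ids = np.array([0, 1, 1, 0])   # row -> position in result
#     agg = np.array([10., 20.])     # one aggregated value per group
#     algos.take_1d(agg, ids)        # -> array([10., 20., 20., 10.])
#
# take_1d fills id -1 positions with NaN, which is why the cast back
# to the original dtype (next comment) may be needed afterwards.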
+ # try casting data to original dtype + cast = (self.size().fillna(0) > 0).any() - return (DataFrame(results, columns=result.columns, index=obj.index) - ._convert(datetime=True)) + # for each col, reshape to the size of the original frame + # by take operation + ids, _, ngroup = self.grouper.group_info + output = [] + for i, _ in enumerate(result.columns): + res = algos.take_1d(result.iloc[:, i].values, ids) + if cast: + res = self._try_cast(res, obj.iloc[:, i]) + output.append(res) + + return DataFrame._from_arrays(output, columns=result.columns, + index=obj.index) def _define_paths(self, func, *args, **kwargs): if isinstance(func, compat.string_types): diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 55df64264d6f9..5a6667e57ce9d 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -3025,8 +3025,7 @@ def f(x): c = pd.cut(df.a, bins=[0, 10, 20, 30, 40]) result = df.a.groupby(c).transform(sum) - tm.assert_series_equal(result, df['a'], check_names=False) - self.assertTrue(result.name is None) + tm.assert_series_equal(result, df['a']) tm.assert_series_equal( df.a.groupby(c).transform(lambda xs: np.sum(xs)), df['a']) @@ -3043,8 +3042,7 @@ def f(x): c = pd.cut(df.a, bins=[-10, 0, 10, 20, 30, 40]) result = df.a.groupby(c).transform(sum) - tm.assert_series_equal(result, df['a'], check_names=False) - self.assertTrue(result.name is None) + tm.assert_series_equal(result, df['a']) tm.assert_series_equal( df.a.groupby(c).transform(lambda xs: np.sum(xs)), df['a']) diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 583b1c7aea270..d15bab708b61f 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -1051,13 +1051,39 @@ def test_transform_fast(self): values = np.repeat(grp.mean().values, com._ensure_platform_int(grp.count().values)) - expected = pd.Series(values, index=df.index) + expected = pd.Series(values, index=df.index, name='val') result = grp.transform(np.mean) assert_series_equal(result, expected) result = grp.transform('mean') assert_series_equal(result, expected) + # GH 12737 + df = pd.DataFrame({'grouping': [0, 1, 1, 3], 'f': [1.1, 2.1, 3.1, 4.5], + 'd': pd.date_range('2014-1-1', '2014-1-4'), + 'i': [1, 2, 3, 4]}, + columns=['grouping', 'f', 'i', 'd']) + result = df.groupby('grouping').transform('first') + + dates = [pd.Timestamp('2014-1-1'), pd.Timestamp('2014-1-2'), + pd.Timestamp('2014-1-2'), pd.Timestamp('2014-1-4')] + expected = pd.DataFrame({'f': [1.1, 2.1, 2.1, 4.5], + 'd': dates, + 'i': [1, 2, 2, 4]}, + columns=['f', 'i', 'd']) + assert_frame_equal(result, expected) + + # selection + result = df.groupby('grouping')[['f', 'i']].transform('first') + expected = expected[['f', 'i']] + assert_frame_equal(result, expected) + + # dup columns + df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=['g', 'a', 'a']) + result = df.groupby('g').transform('first') + expected = df.drop('g', axis=1) + assert_frame_equal(result, expected) + def test_transform_broadcast(self): grouped = self.ts.groupby(lambda x: x.month) result = grouped.transform(np.mean) @@ -1191,6 +1217,16 @@ def test_transform_function_aliases(self): expected = self.df.groupby('A')['C'].transform(np.mean) assert_series_equal(result, expected) + def test_series_fast_transform_date(self): + # GH 13191 + df = pd.DataFrame({'grouping': [np.nan, 1, 1, 3], + 'd': pd.date_range('2014-1-1', '2014-1-4')}) + result = df.groupby('grouping')['d'].transform('first') + dates = [pd.NaT, pd.Timestamp('2014-1-2'), 
pd.Timestamp('2014-1-2'), + pd.Timestamp('2014-1-4')] + expected = pd.Series(dates, name='d') + assert_series_equal(result, expected) + def test_transform_length(self): # GH 9697 df = pd.DataFrame({'col1': [1, 1, 2, 2], 'col2': [1, 2, 3, np.nan]}) @@ -4406,7 +4442,7 @@ def test_groupby_datetime64_32_bit(self): df = DataFrame({"A": range(2), "B": [pd.Timestamp('2000-01-1')] * 2}) result = df.groupby("A")["B"].transform(min) - expected = Series([pd.Timestamp('2000-01-1')] * 2) + expected = Series([pd.Timestamp('2000-01-1')] * 2, name='B') assert_series_equal(result, expected) def test_groupby_categorical_unequal_len(self): From 86f68e6a48bc0219493f093e4224fe772f24ecac Mon Sep 17 00:00:00 2001 From: sinhrks Date: Wed, 18 May 2016 09:20:48 -0400 Subject: [PATCH 25/96] BUG: Sparse creation with object dtype may raise TypeError closes #11633 closes #11856 Author: sinhrks Closes #13201 from sinhrks/sparse_isnull and squashes the following commits: 443b47e [sinhrks] BUG: Sparse creation with object dtype may raise TypeError --- doc/source/whatsnew/v0.18.2.txt | 1 + pandas/sparse/array.py | 20 +++++++---- pandas/sparse/tests/test_array.py | 11 ++++++ pandas/sparse/tests/test_groupby.py | 46 +++++++++++++++++++++++++ pandas/sparse/tests/test_pivot.py | 52 +++++++++++++++++++++++++++++ pandas/tests/test_groupby.py | 4 +-- 6 files changed, 126 insertions(+), 8 deletions(-) create mode 100644 pandas/sparse/tests/test_groupby.py create mode 100644 pandas/sparse/tests/test_pivot.py diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index 36fdbe8c66860..485591b9357ea 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -116,6 +116,7 @@ Bug Fixes - Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing may raise ``IndexError`` (:issue:`13144`) - Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing result may have normal ``Index`` (:issue:`13144`) - Bug in ``SparseDataFrame`` in which ``axis=None`` did not default to ``axis=0`` (:issue:`13048`) +- Bug in ``SparseSeries`` and ``SparseDataFrame`` creation with ``object`` dtype may raise ``TypeError`` (:issue:`11633`) - Bug when passing a not-default-indexed ``Series`` as ``xerr`` or ``yerr`` in ``.plot()`` (:issue:`11858`) diff --git a/pandas/sparse/array.py b/pandas/sparse/array.py index e114bee87ca27..0312fb023f7fd 100644 --- a/pandas/sparse/array.py +++ b/pandas/sparse/array.py @@ -152,9 +152,17 @@ def __new__(cls, data, sparse_index=None, index=None, kind='integer', # Create array, do *not* copy data by default if copy: - subarr = np.array(values, dtype=dtype, copy=True) + try: + # ToDo: Can remove this error handling when we actually + # support other dtypes + subarr = np.array(values, dtype=dtype, copy=True) + except ValueError: + subarr = np.array(values, copy=True) else: - subarr = np.asarray(values, dtype=dtype) + try: + subarr = np.asarray(values, dtype=dtype) + except ValueError: + subarr = np.asarray(values) # if we have a bool type, make sure that we have a bool fill_value if ((dtype is not None and issubclass(dtype.type, np.bool_)) or @@ -437,12 +445,12 @@ def count(self): @property def _null_fill_value(self): - return np.isnan(self.fill_value) + return com.isnull(self.fill_value) @property def _valid_sp_values(self): sp_vals = self.sp_values - mask = np.isfinite(sp_vals) + mask = com.notnull(sp_vals) return sp_vals[mask] @Appender(_index_shared_docs['fillna'] % _sparray_doc_kwargs) @@ -616,8 +624,8 @@ def make_sparse(arr, kind='block', fill_value=nan): if arr.ndim > 1: 
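# Context for the hunk just below (a note, not patch content):
# np.isnan raises TypeError on object arrays, e.g.
#
#     np.isnan(np.array(['A', np.nan], dtype=object))  # TypeError
#
# which is what sparse construction from object data hit in GH11633
# and GH11856; com.isnull / com.notnull evaluate element-wise on
# object dtype, hence the switch in mask construction.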
raise TypeError("expected dimension <= 1 data") - if np.isnan(fill_value): - mask = ~np.isnan(arr) + if com.isnull(fill_value): + mask = com.notnull(arr) else: mask = arr != fill_value diff --git a/pandas/sparse/tests/test_array.py b/pandas/sparse/tests/test_array.py index 26d018c56a8a8..dd2126d0f52d2 100644 --- a/pandas/sparse/tests/test_array.py +++ b/pandas/sparse/tests/test_array.py @@ -46,6 +46,17 @@ def test_constructor_dtype(self): self.assertEqual(arr.dtype, np.int64) self.assertEqual(arr.fill_value, 0) + def test_constructor_object_dtype(self): + # GH 11856 + arr = SparseArray(['A', 'A', np.nan, 'B'], dtype=np.object) + self.assertEqual(arr.dtype, np.object) + self.assertTrue(np.isnan(arr.fill_value)) + + arr = SparseArray(['A', 'A', np.nan, 'B'], dtype=np.object, + fill_value='A') + self.assertEqual(arr.dtype, np.object) + self.assertEqual(arr.fill_value, 'A') + def test_constructor_spindex_dtype(self): arr = SparseArray(data=[1, 2], sparse_index=IntIndex(4, [1, 2])) tm.assert_sp_array_equal(arr, SparseArray([np.nan, 1, 2, np.nan])) diff --git a/pandas/sparse/tests/test_groupby.py b/pandas/sparse/tests/test_groupby.py new file mode 100644 index 0000000000000..0cb33f4ea0a56 --- /dev/null +++ b/pandas/sparse/tests/test_groupby.py @@ -0,0 +1,46 @@ +# -*- coding: utf-8 -*- +import numpy as np +import pandas as pd +import pandas.util.testing as tm + + +class TestSparseGroupBy(tm.TestCase): + + _multiprocess_can_split_ = True + + def setUp(self): + self.dense = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar', + 'foo', 'bar', 'foo', 'foo'], + 'B': ['one', 'one', 'two', 'three', + 'two', 'two', 'one', 'three'], + 'C': np.random.randn(8), + 'D': np.random.randn(8), + 'E': [np.nan, np.nan, 1, 2, + np.nan, 1, np.nan, np.nan]}) + self.sparse = self.dense.to_sparse() + + def test_first_last_nth(self): + # tests for first / last / nth + sparse_grouped = self.sparse.groupby('A') + dense_grouped = self.dense.groupby('A') + + tm.assert_frame_equal(sparse_grouped.first(), + dense_grouped.first()) + tm.assert_frame_equal(sparse_grouped.last(), + dense_grouped.last()) + tm.assert_frame_equal(sparse_grouped.nth(1), + dense_grouped.nth(1)) + + def test_aggfuncs(self): + sparse_grouped = self.sparse.groupby('A') + dense_grouped = self.dense.groupby('A') + + tm.assert_frame_equal(sparse_grouped.mean(), + dense_grouped.mean()) + + # ToDo: sparse sum includes str column + # tm.assert_frame_equal(sparse_grouped.sum(), + # dense_grouped.sum()) + + tm.assert_frame_equal(sparse_grouped.count(), + dense_grouped.count()) diff --git a/pandas/sparse/tests/test_pivot.py b/pandas/sparse/tests/test_pivot.py new file mode 100644 index 0000000000000..482a99a96194f --- /dev/null +++ b/pandas/sparse/tests/test_pivot.py @@ -0,0 +1,52 @@ +import numpy as np +import pandas as pd +import pandas.util.testing as tm + + +class TestPivotTable(tm.TestCase): + + _multiprocess_can_split_ = True + + def setUp(self): + self.dense = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar', + 'foo', 'bar', 'foo', 'foo'], + 'B': ['one', 'one', 'two', 'three', + 'two', 'two', 'one', 'three'], + 'C': np.random.randn(8), + 'D': np.random.randn(8), + 'E': [np.nan, np.nan, 1, 2, + np.nan, 1, np.nan, np.nan]}) + self.sparse = self.dense.to_sparse() + + def test_pivot_table(self): + res_sparse = pd.pivot_table(self.sparse, index='A', columns='B', + values='C') + res_dense = pd.pivot_table(self.dense, index='A', columns='B', + values='C') + tm.assert_frame_equal(res_sparse, res_dense) + + res_sparse = pd.pivot_table(self.sparse, index='A', columns='B', + 
values='E') + res_dense = pd.pivot_table(self.dense, index='A', columns='B', + values='E') + tm.assert_frame_equal(res_sparse, res_dense) + + res_sparse = pd.pivot_table(self.sparse, index='A', columns='B', + values='E', aggfunc='mean') + res_dense = pd.pivot_table(self.dense, index='A', columns='B', + values='E', aggfunc='mean') + tm.assert_frame_equal(res_sparse, res_dense) + + # ToDo: sum doesn't handle nan properly + # res_sparse = pd.pivot_table(self.sparse, index='A', columns='B', + # values='E', aggfunc='sum') + # res_dense = pd.pivot_table(self.dense, index='A', columns='B', + # values='E', aggfunc='sum') + # tm.assert_frame_equal(res_sparse, res_dense) + + def test_pivot_table_multi(self): + res_sparse = pd.pivot_table(self.sparse, index='A', columns='B', + values=['D', 'E']) + res_dense = pd.pivot_table(self.dense, index='A', columns='B', + values=['D', 'E']) + tm.assert_frame_equal(res_sparse, res_dense) diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index d15bab708b61f..571b0fa1ee78f 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -4544,7 +4544,7 @@ def test_groupby_with_empty(self): grouped = series.groupby(grouper) assert next(iter(grouped), None) is None - def test_aaa_groupby_with_small_elem(self): + def test_groupby_with_small_elem(self): # GH 8542 # length=2 df = pd.DataFrame({'event': ['start', 'start'], @@ -6008,7 +6008,7 @@ def test__cython_agg_general(self): exc.args += ('operation: %s' % op, ) raise - def test_aa_cython_group_transform_algos(self): + def test_cython_group_transform_algos(self): # GH 4095 dtypes = [np.int8, np.int16, np.int32, np.int64, np.uint8, np.uint32, np.uint64, np.float32, np.float64] From 4b501497a88c118c2bfdcdbc4a5b216b68b1b88c Mon Sep 17 00:00:00 2001 From: Alex Alekseyev Date: Wed, 18 May 2016 18:08:39 -0400 Subject: [PATCH 26/96] TST: Test resampling with NaT closes #13020 Author: Alex Alekseyev Closes #13164 from evectant/issue-13020 and squashes the following commits: c7e6f5f [Alex Alekseyev] TST: Test resampling with NaT --- pandas/tseries/tests/test_resample.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/pandas/tseries/tests/test_resample.py b/pandas/tseries/tests/test_resample.py index 5dd2368db2cb8..dd5ab36d10a45 100644 --- a/pandas/tseries/tests/test_resample.py +++ b/pandas/tseries/tests/test_resample.py @@ -1846,6 +1846,32 @@ def test_resmaple_dst_anchor(self): freq='D', tz='Europe/Paris')), 'D Frequency') + def test_resample_with_nat(self): + # GH 13020 + index = DatetimeIndex([pd.NaT, + '1970-01-01 00:00:00', + pd.NaT, + '1970-01-01 00:00:01', + '1970-01-01 00:00:02']) + frame = DataFrame([2, 3, 5, 7, 11], index=index) + + index_1s = DatetimeIndex(['1970-01-01 00:00:00', + '1970-01-01 00:00:01', + '1970-01-01 00:00:02']) + frame_1s = DataFrame([3, 7, 11], index=index_1s) + assert_frame_equal(frame.resample('1s').mean(), frame_1s) + + index_2s = DatetimeIndex(['1970-01-01 00:00:00', + '1970-01-01 00:00:02']) + frame_2s = DataFrame([5, 11], index=index_2s) + assert_frame_equal(frame.resample('2s').mean(), frame_2s) + + index_3s = DatetimeIndex(['1970-01-01 00:00:00']) + frame_3s = DataFrame([7], index=index_3s) + assert_frame_equal(frame.resample('3s').mean(), frame_3s) + + assert_frame_equal(frame.resample('60s').mean(), frame_3s) + class TestPeriodIndex(Base, tm.TestCase): _multiprocess_can_split_ = True From eeccd058a5199c3e4fd9900b95e00672f701b3e9 Mon Sep 17 00:00:00 2001 From: Felix Marczinowski Date: Thu, 19 May 2016 08:43:04 -0400 
Subject: [PATCH 27/96] BUG: Fix #13213 json_normalize() and non-ascii characters in keys closes #13213 Author: Felix Marczinowski Closes #13214 from fmarczin/13213-unicode-json_normalize and squashes the following commits: 22e01b2 [Felix Marczinowski] fix linter warnings 44745ca [Felix Marczinowski] fix tests for py3 25fd0f8 [Felix Marczinowski] move test, fix py3 issue 7a38110 [Felix Marczinowski] add whatsnew note dd7302c [Felix Marczinowski] remove encoding signature from test 4dcd2c5 [Felix Marczinowski] fix for #13213 b9751e9 [Felix Marczinowski] add test for #13213 --- doc/source/whatsnew/v0.18.2.txt | 1 + pandas/io/json.py | 6 ++++-- pandas/io/tests/json/test_json_norm.py | 22 ++++++++++++++++++++++ 3 files changed, 27 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index 485591b9357ea..59ab33fff1967 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -113,6 +113,7 @@ Performance Improvements Bug Fixes ~~~~~~~~~ +- Bug in ``io.json.json_normalize()``, where non-ascii keys raised an exception (:issue:`13213`) - Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing may raise ``IndexError`` (:issue:`13144`) - Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing result may have normal ``Index`` (:issue:`13144`) - Bug in ``SparseDataFrame`` in which ``axis=None`` did not default to ``axis=0`` (:issue:`13048`) diff --git a/pandas/io/json.py b/pandas/io/json.py index 08bfd8d7796a0..fd97e51208f7e 100644 --- a/pandas/io/json.py +++ b/pandas/io/json.py @@ -614,10 +614,12 @@ def nested_to_record(ds, prefix="", level=0): new_d = copy.deepcopy(d) for k, v in d.items(): # each key gets renamed with prefix + if not isinstance(k, compat.string_types): + k = str(k) if level == 0: - newkey = str(k) + newkey = k else: - newkey = prefix + '.' + str(k) + newkey = prefix + '.' 
+ k # only dicts get recursively flattened # only at level>1 do we rename the rest of the keys diff --git a/pandas/io/tests/json/test_json_norm.py b/pandas/io/tests/json/test_json_norm.py index 81a1fecbdebac..4848db97194d9 100644 --- a/pandas/io/tests/json/test_json_norm.py +++ b/pandas/io/tests/json/test_json_norm.py @@ -2,8 +2,10 @@ from pandas import DataFrame import numpy as np +import json import pandas.util.testing as tm +from pandas import compat from pandas.io.json import json_normalize, nested_to_record @@ -164,6 +166,26 @@ def test_record_prefix(self): tm.assert_frame_equal(result, expected) + def test_non_ascii_key(self): + if compat.PY3: + testjson = ( + b'[{"\xc3\x9cnic\xc3\xb8de":0,"sub":{"A":1, "B":2}},' + + b'{"\xc3\x9cnic\xc3\xb8de":1,"sub":{"A":3, "B":4}}]' + ).decode('utf8') + else: + testjson = ('[{"\xc3\x9cnic\xc3\xb8de":0,"sub":{"A":1, "B":2}},' + '{"\xc3\x9cnic\xc3\xb8de":1,"sub":{"A":3, "B":4}}]') + + testdata = { + u'sub.A': [1, 3], + u'sub.B': [2, 4], + b"\xc3\x9cnic\xc3\xb8de".decode('utf8'): [0, 1] + } + expected = DataFrame(testdata) + + result = json_normalize(json.loads(testjson)) + tm.assert_frame_equal(result, expected) + class TestNestedToRecord(tm.TestCase): From 070e877ed6600a32346a7c207d339a820374abd7 Mon Sep 17 00:00:00 2001 From: Elliot Marsden Date: Thu, 19 May 2016 10:50:46 -0400 Subject: [PATCH 28/96] BUG: Fix argument order in call to super Author: Elliot Marsden Closes #12924 from eddiejessup/master and squashes the following commits: b495e32 [Elliot Marsden] BUG: Fix argument order in call to super --- doc/source/whatsnew/v0.18.2.txt | 1 + pandas/core/base.py | 2 +- pandas/tests/test_base.py | 63 +++++++++++++++++++-------------- 3 files changed, 39 insertions(+), 27 deletions(-) diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index 59ab33fff1967..61461be87801e 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -124,6 +124,7 @@ Bug Fixes - Bug in ``.groupby(..).resample(..)`` when the same object is called multiple times (:issue:`13174`) - Bug in ``.to_records()`` when index name is a unicode string (:issue:`13172`) +- Bug in calling ``.memory_usage()`` on an object which doesn't implement it (:issue:`12924`) - Regression in ``Series.quantile`` with nans (also shows up in ``.median()`` and ``.describe()``); furthermore now names the ``Series`` with the quantile (:issue:`13098`, :issue:`13146`) diff --git a/pandas/core/base.py b/pandas/core/base.py index 1a812ba2e4878..36f1f24fec6f7 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -127,7 +127,7 @@ def __sizeof__(self): # no memory_usage attribute, so fall back to # object's 'sizeof' - return super(self, PandasObject).__sizeof__() + return super(PandasObject, self).__sizeof__() class NoNewAttributesMixin(object): diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index 2fec7c591a2b7..2b28e3b6ed8e0 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -147,42 +147,46 @@ def test_values(self): class TestPandasDelegate(tm.TestCase): - def setUp(self): - pass + class Delegator(object): + _properties = ['foo'] + _methods = ['bar'] - def test_invalida_delgation(self): - # these show that in order for the delegation to work - # the _delegate_* methods need to be overridden to not raise a TypeError + def _set_foo(self, value): + self.foo = value 
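# The one-line base.py fix above comes down to super()'s argument
# order: the signature is super(type, obj), so the fallback must be
# super(PandasObject, self).__sizeof__(); super(self, PandasObject)
# raises TypeError because the first argument must be a class.
# A minimal sketch of the corrected pattern (hypothetical class):
#
#     class Sized(object):
#         def nbytes(self):
#             # class first, instance second
#             return super(Sized, self).__sizeof__()
#
#     Sized().nbytes()  # returns an int instead of raising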
+ foo = property(_get_foo, _set_foo, doc="foo property") - def _get_foo(self): - return self.foo + def bar(self, *args, **kwargs): + """ a test bar method """ + pass - foo = property(_get_foo, _set_foo, doc="foo property") + class Delegate(PandasDelegate): - def bar(self, *args, **kwargs): - """ a test bar method """ - pass + def __init__(self, obj): + self.obj = obj - class Delegate(PandasDelegate): + def setUp(self): + pass - def __init__(self, obj): - self.obj = obj + def test_invalida_delgation(self): + # these show that in order for the delegation to work + # the _delegate_* methods need to be overridden to not raise a TypeError - Delegate._add_delegate_accessors(delegate=Delegator, - accessors=Delegator._properties, - typ='property') - Delegate._add_delegate_accessors(delegate=Delegator, - accessors=Delegator._methods, - typ='method') + self.Delegate._add_delegate_accessors( + delegate=self.Delegator, + accessors=self.Delegator._properties, + typ='property' + ) + self.Delegate._add_delegate_accessors( + delegate=self.Delegator, + accessors=self.Delegator._methods, + typ='method' + ) - delegate = Delegate(Delegator()) + delegate = self.Delegate(self.Delegator()) def f(): delegate.foo @@ -199,6 +203,13 @@ def f(): self.assertRaises(TypeError, f) + def test_memory_usage(self): + # Delegate does not implement memory_usage. + # Check that we fall back to in-built `__sizeof__` + # GH 12924 + delegate = self.Delegate(self.Delegator()) + sys.getsizeof(delegate) + class Ops(tm.TestCase): From 2a120cf161ed3c71ee68063e9d13e3efd0d2c1e5 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 19 May 2016 15:07:55 -0400 Subject: [PATCH 29/96] DOC: add v0.19.0 whatsnew doc --- doc/source/whatsnew/v0.19.0.txt | 83 +++++++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) create mode 100644 doc/source/whatsnew/v0.19.0.txt diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt new file mode 100644 index 0000000000000..42db0388ca5d9 --- /dev/null +++ b/doc/source/whatsnew/v0.19.0.txt @@ -0,0 +1,83 @@ +.. _whatsnew_0190: + +v0.19.0 (????, 2016) +-------------------- + +This is a major release from 0.18.2 and includes a small number of API changes, several new features, +enhancements, and performance improvements along with a large number of bug fixes. We recommend that all +users upgrade to this version. + +Highlights include: + + +Check the :ref:`API Changes ` and :ref:`deprecations ` before updating. + +.. contents:: What's new in v0.19.0 + :local: + :backlinks: none + +.. _whatsnew_0190.enhancements: + +New features +~~~~~~~~~~~~ + + + + + +.. _whatsnew_0190.enhancements.other: + +Other enhancements +^^^^^^^^^^^^^^^^^^ + + + + + + +.. _whatsnew_0190.api_breaking: + +Backwards incompatible API changes +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. _whatsnew_0190.api: + + + + + + +Other API Changes +^^^^^^^^^^^^^^^^^ + +.. _whatsnew_0190.deprecations: + +Deprecations +^^^^^^^^^^^^ + + + + + +.. _whatsnew_0190.prior_deprecations: + +Removal of prior version deprecations/changes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + + + + + +.. _whatsnew_0190.performance: + +Performance Improvements +~~~~~~~~~~~~~~~~~~~~~~~~ + + + + + +.. 
_whatsnew_0190.bug_fixes: + +Bug Fixes +~~~~~~~~~ From fecb2ca8559ceee5ce1e4ecd48c7e8a7560d4ce0 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Fri, 20 May 2016 09:12:50 -0400 Subject: [PATCH 30/96] COMPAT: Further Expand Compatibility with fromnumeric.py Follow-on to #12810 by expanding compatibility with fromnumeric.py in the following modules: 1) tslib.pyx 2) window.py 3) groupby.py and resample.py (shared classes) Closes #12811. Author: gfyoung Closes #13148 from gfyoung/fromnumeric-compat-continued and squashes the following commits: eb4762c [gfyoung] COMPAT: Expand compatibility with fromnumeric.py --- doc/source/whatsnew/v0.18.2.txt | 3 +- pandas/compat/numpy/function.py | 76 +++++++++++++++++++++- pandas/core/common.py | 4 ++ pandas/core/generic.py | 8 +-- pandas/core/groupby.py | 19 ++++-- pandas/core/window.py | 86 ++++++++++++++++--------- pandas/tests/test_groupby.py | 14 ++++ pandas/tests/test_window.py | 93 +++++++++++++++++++++++++++ pandas/tseries/resample.py | 11 +++- pandas/tseries/tests/test_resample.py | 19 +++++- pandas/tseries/tests/test_tslib.py | 28 ++++++++ pandas/tslib.pyx | 4 +- 12 files changed, 316 insertions(+), 49 deletions(-) diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index 61461be87801e..3aadfd73895e4 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -46,7 +46,8 @@ API changes - Non-convertible dates in an excel date column will be returned without conversion and the column will be ``object`` dtype, rather than raising an exception (:issue:`10001`) - +- Compat with ``np.round`` and timestamps (:issue:`12811`) +- An ``UnsupportedFunctionCall`` error is now raised if numpy ufuncs like ``np.mean`` are called on groupby or resample objects (:issue:`12811`) .. _whatsnew_0182.api.tolist: diff --git a/pandas/compat/numpy/function.py b/pandas/compat/numpy/function.py index 069cb3638fe75..274761f5d0b9c 100644 --- a/pandas/compat/numpy/function.py +++ b/pandas/compat/numpy/function.py @@ -21,7 +21,7 @@ from numpy import ndarray from pandas.util.validators import (validate_args, validate_kwargs, validate_args_and_kwargs) -from pandas.core.common import is_integer +from pandas.core.common import is_integer, UnsupportedFunctionCall from pandas.compat import OrderedDict @@ -245,3 +245,77 @@ def validate_transpose_for_generic(inst, kwargs): msg += " for {klass} instances".format(klass=klass) raise ValueError(msg) + + +def validate_window_func(name, args, kwargs): + numpy_args = ('axis', 'dtype', 'out') + msg = ("numpy operations are not " + "valid with window objects. " + "Use .{func}() directly instead ".format(func=name)) + + if len(args) > 0: + raise UnsupportedFunctionCall(msg) + + for arg in numpy_args: + if arg in kwargs: + raise UnsupportedFunctionCall(msg) + + +def validate_rolling_func(name, args, kwargs): + numpy_args = ('axis', 'dtype', 'out') + msg = ("numpy operations are not " + "valid with window objects. " + "Use .rolling(...).{func}() instead ".format(func=name)) + + if len(args) > 0: + raise UnsupportedFunctionCall(msg) + + for arg in numpy_args: + if arg in kwargs: + raise UnsupportedFunctionCall(msg) + + +def validate_expanding_func(name, args, kwargs): + numpy_args = ('axis', 'dtype', 'out') + msg = ("numpy operations are not " + "valid with window objects. 
" + "Use .expanding(...).{func}() instead ".format(func=name)) + + if len(args) > 0: + raise UnsupportedFunctionCall(msg) + + for arg in numpy_args: + if arg in kwargs: + raise UnsupportedFunctionCall(msg) + + +def validate_groupby_func(name, args, kwargs): + """ + 'args' and 'kwargs' should be empty because all of + their necessary parameters are explicitly listed in + the function signature + """ + if len(args) + len(kwargs) > 0: + raise UnsupportedFunctionCall(( + "numpy operations are not valid " + "with groupby. Use .groupby(...)." + "{func}() instead".format(func=name))) + +RESAMPLER_NUMPY_OPS = ('min', 'max', 'sum', 'prod', + 'mean', 'std', 'var') + + +def validate_resampler_func(method, args, kwargs): + """ + 'args' and 'kwargs' should be empty because all of + their necessary parameters are explicitly listed in + the function signature + """ + if len(args) + len(kwargs) > 0: + if method in RESAMPLER_NUMPY_OPS: + raise UnsupportedFunctionCall(( + "numpy operations are not valid " + "with resample. Use .resample(...)." + "{func}() instead".format(func=method))) + else: + raise TypeError("too many arguments passed in") diff --git a/pandas/core/common.py b/pandas/core/common.py index c64cfa77b9e62..64bfbdde0c5c3 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -41,6 +41,10 @@ class AmbiguousIndexError(PandasError, KeyError): pass +class UnsupportedFunctionCall(ValueError): + pass + + class AbstractMethodError(NotImplementedError): """Raise this error instead of NotImplementedError for abstract methods while keeping compatibility with Python 2 and Python 3. diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 6c80ab9d87e33..99599d2b04a45 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5299,7 +5299,7 @@ def _make_stat_function(cls, name, name1, name2, axis_descr, desc, f): @Appender(_num_doc) def stat_func(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): - nv.validate_stat_func(tuple(), kwargs) + nv.validate_stat_func(tuple(), kwargs, fname=name) if skipna is None: skipna = True if axis is None: @@ -5319,7 +5319,7 @@ def _make_stat_function_ddof(cls, name, name1, name2, axis_descr, desc, f): @Appender(_num_ddof_doc) def stat_func(self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs): - nv.validate_stat_ddof_func(tuple(), kwargs) + nv.validate_stat_ddof_func(tuple(), kwargs, fname=name) if skipna is None: skipna = True if axis is None: @@ -5340,7 +5340,7 @@ def _make_cum_function(cls, name, name1, name2, axis_descr, desc, accum_func, @Appender("Return cumulative {0} over requested axis.".format(name) + _cnum_doc) def cum_func(self, axis=None, dtype=None, out=None, skipna=True, **kwargs): - nv.validate_cum_func(tuple(), kwargs) + nv.validate_cum_func(tuple(), kwargs, fname=name) if axis is None: axis = self._stat_axis_number else: @@ -5374,7 +5374,7 @@ def _make_logical_function(cls, name, name1, name2, axis_descr, desc, f): @Appender(_bool_doc) def logical_func(self, axis=None, bool_only=None, skipna=None, level=None, **kwargs): - nv.validate_logical_func(tuple(), kwargs) + nv.validate_logical_func(tuple(), kwargs, fname=name) if skipna is None: skipna = True if axis is None: diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 424859da82877..2346be5c854f5 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -11,6 +11,7 @@ callable, map ) from pandas import compat +from pandas.compat.numpy import function as nv from pandas.compat.numpy import 
_np_version_under1p8 from pandas.core.base import (PandasObject, SelectionMixin, GroupByError, DataError, SpecificationError) @@ -954,12 +955,13 @@ def count(self): @Substitution(name='groupby') @Appender(_doc_template) - def mean(self): + def mean(self, *args, **kwargs): """ Compute mean of groups, excluding missing values For multiple groupings, the result index will be a MultiIndex """ + nv.validate_groupby_func('mean', args, kwargs) try: return self._cython_agg_general('mean') except GroupByError: @@ -993,7 +995,7 @@ def f(x): @Substitution(name='groupby') @Appender(_doc_template) - def std(self, ddof=1): + def std(self, ddof=1, *args, **kwargs): """ Compute standard deviation of groups, excluding missing values @@ -1005,12 +1007,13 @@ def std(self, ddof=1): degrees of freedom """ - # todo, implement at cython level? + # TODO: implement at Cython level? + nv.validate_groupby_func('std', args, kwargs) return np.sqrt(self.var(ddof=ddof)) @Substitution(name='groupby') @Appender(_doc_template) - def var(self, ddof=1): + def var(self, ddof=1, *args, **kwargs): """ Compute variance of groups, excluding missing values @@ -1021,7 +1024,7 @@ def var(self, ddof=1): ddof : integer, default 1 degrees of freedom """ - + nv.validate_groupby_func('var', args, kwargs) if ddof == 1: return self._cython_agg_general('var') else: @@ -1317,8 +1320,9 @@ def cumcount(self, ascending=True): @Substitution(name='groupby') @Appender(_doc_template) - def cumprod(self, axis=0): + def cumprod(self, axis=0, *args, **kwargs): """Cumulative product for each group""" + nv.validate_groupby_func('cumprod', args, kwargs) if axis != 0: return self.apply(lambda x: x.cumprod(axis=axis)) @@ -1326,8 +1330,9 @@ def cumprod(self, axis=0): @Substitution(name='groupby') @Appender(_doc_template) - def cumsum(self, axis=0): + def cumsum(self, axis=0, *args, **kwargs): """Cumulative sum for each group""" + nv.validate_groupby_func('cumsum', args, kwargs) if axis != 0: return self.apply(lambda x: x.cumprod(axis=axis)) diff --git a/pandas/core/window.py b/pandas/core/window.py index eb0d996436661..cd66d4e30c351 100644 --- a/pandas/core/window.py +++ b/pandas/core/window.py @@ -18,6 +18,7 @@ import pandas.core.common as com import pandas.algos as algos from pandas import compat +from pandas.compat.numpy import function as nv from pandas.util.decorators import Substitution, Appender from textwrap import dedent @@ -435,13 +436,15 @@ def aggregate(self, arg, *args, **kwargs): @Substitution(name='window') @Appender(_doc_template) @Appender(_shared_docs['sum']) - def sum(self, **kwargs): + def sum(self, *args, **kwargs): + nv.validate_window_func('sum', args, kwargs) return self._apply_window(mean=False, **kwargs) @Substitution(name='window') @Appender(_doc_template) @Appender(_shared_docs['mean']) - def mean(self, **kwargs): + def mean(self, *args, **kwargs): + nv.validate_window_func('mean', args, kwargs) return self._apply_window(mean=True, **kwargs) @@ -620,7 +623,8 @@ def f(arg, window, min_periods): return self._apply(f, func, args=args, kwargs=kwargs, center=False) - def sum(self, **kwargs): + def sum(self, *args, **kwargs): + nv.validate_window_func('sum', args, kwargs) return self._apply('roll_sum', 'sum', **kwargs) _shared_docs['max'] = dedent(""" @@ -631,7 +635,8 @@ def sum(self, **kwargs): how : string, default 'max' (DEPRECATED) Method for down- or re-sampling""") - def max(self, how=None, **kwargs): + def max(self, how=None, *args, **kwargs): + nv.validate_window_func('max', args, kwargs) if self.freq is not None and how is 
None: how = 'max' return self._apply('roll_max', 'max', how=how, **kwargs) @@ -644,12 +649,14 @@ def max(self, how=None, **kwargs): how : string, default 'min' (DEPRECATED) Method for down- or re-sampling""") - def min(self, how=None, **kwargs): + def min(self, how=None, *args, **kwargs): + nv.validate_window_func('min', args, kwargs) if self.freq is not None and how is None: how = 'min' return self._apply('roll_min', 'min', how=how, **kwargs) - def mean(self, **kwargs): + def mean(self, *args, **kwargs): + nv.validate_window_func('mean', args, kwargs) return self._apply('roll_mean', 'mean', **kwargs) _shared_docs['median'] = dedent(""" @@ -674,7 +681,8 @@ def median(self, how=None, **kwargs): Delta Degrees of Freedom. The divisor used in calculations is ``N - ddof``, where ``N`` represents the number of elements.""") - def std(self, ddof=1, **kwargs): + def std(self, ddof=1, *args, **kwargs): + nv.validate_window_func('std', args, kwargs) window = self._get_window() def f(arg, *args, **kwargs): @@ -693,7 +701,8 @@ def f(arg, *args, **kwargs): Delta Degrees of Freedom. The divisor used in calculations is ``N - ddof``, where ``N`` represents the number of elements.""") - def var(self, ddof=1, **kwargs): + def var(self, ddof=1, *args, **kwargs): + nv.validate_window_func('var', args, kwargs) return self._apply('roll_var', 'var', check_minp=_require_min_periods(1), ddof=ddof, **kwargs) @@ -865,26 +874,30 @@ def apply(self, func, args=(), kwargs={}): @Substitution(name='rolling') @Appender(_doc_template) @Appender(_shared_docs['sum']) - def sum(self, **kwargs): - return super(Rolling, self).sum(**kwargs) + def sum(self, *args, **kwargs): + nv.validate_rolling_func('sum', args, kwargs) + return super(Rolling, self).sum(*args, **kwargs) @Substitution(name='rolling') @Appender(_doc_template) @Appender(_shared_docs['max']) - def max(self, **kwargs): - return super(Rolling, self).max(**kwargs) + def max(self, *args, **kwargs): + nv.validate_rolling_func('max', args, kwargs) + return super(Rolling, self).max(*args, **kwargs) @Substitution(name='rolling') @Appender(_doc_template) @Appender(_shared_docs['min']) - def min(self, **kwargs): - return super(Rolling, self).min(**kwargs) + def min(self, *args, **kwargs): + nv.validate_rolling_func('min', args, kwargs) + return super(Rolling, self).min(*args, **kwargs) @Substitution(name='rolling') @Appender(_doc_template) @Appender(_shared_docs['mean']) - def mean(self, **kwargs): - return super(Rolling, self).mean(**kwargs) + def mean(self, *args, **kwargs): + nv.validate_rolling_func('mean', args, kwargs) + return super(Rolling, self).mean(*args, **kwargs) @Substitution(name='rolling') @Appender(_doc_template) @@ -895,13 +908,15 @@ def median(self, **kwargs): @Substitution(name='rolling') @Appender(_doc_template) @Appender(_shared_docs['std']) - def std(self, ddof=1, **kwargs): + def std(self, ddof=1, *args, **kwargs): + nv.validate_rolling_func('std', args, kwargs) return super(Rolling, self).std(ddof=ddof, **kwargs) @Substitution(name='rolling') @Appender(_doc_template) @Appender(_shared_docs['var']) - def var(self, ddof=1, **kwargs): + def var(self, ddof=1, *args, **kwargs): + nv.validate_rolling_func('var', args, kwargs) return super(Rolling, self).var(ddof=ddof, **kwargs) @Substitution(name='rolling') @@ -1023,26 +1038,30 @@ def apply(self, func, args=(), kwargs={}): @Substitution(name='expanding') @Appender(_doc_template) @Appender(_shared_docs['sum']) - def sum(self, **kwargs): - return super(Expanding, self).sum(**kwargs) + def sum(self, *args, 
**kwargs): + nv.validate_expanding_func('sum', args, kwargs) + return super(Expanding, self).sum(*args, **kwargs) @Substitution(name='expanding') @Appender(_doc_template) @Appender(_shared_docs['max']) - def max(self, **kwargs): - return super(Expanding, self).max(**kwargs) + def max(self, *args, **kwargs): + nv.validate_expanding_func('max', args, kwargs) + return super(Expanding, self).max(*args, **kwargs) @Substitution(name='expanding') @Appender(_doc_template) @Appender(_shared_docs['min']) - def min(self, **kwargs): - return super(Expanding, self).min(**kwargs) + def min(self, *args, **kwargs): + nv.validate_expanding_func('min', args, kwargs) + return super(Expanding, self).min(*args, **kwargs) @Substitution(name='expanding') @Appender(_doc_template) @Appender(_shared_docs['mean']) - def mean(self, **kwargs): - return super(Expanding, self).mean(**kwargs) + def mean(self, *args, **kwargs): + nv.validate_expanding_func('mean', args, kwargs) + return super(Expanding, self).mean(*args, **kwargs) @Substitution(name='expanding') @Appender(_doc_template) @@ -1053,13 +1072,15 @@ def median(self, **kwargs): @Substitution(name='expanding') @Appender(_doc_template) @Appender(_shared_docs['std']) - def std(self, ddof=1, **kwargs): + def std(self, ddof=1, *args, **kwargs): + nv.validate_expanding_func('std', args, kwargs) return super(Expanding, self).std(ddof=ddof, **kwargs) @Substitution(name='expanding') @Appender(_doc_template) @Appender(_shared_docs['var']) - def var(self, ddof=1, **kwargs): + def var(self, ddof=1, *args, **kwargs): + nv.validate_expanding_func('var', args, kwargs) return super(Expanding, self).var(ddof=ddof, **kwargs) @Substitution(name='expanding') @@ -1273,15 +1294,17 @@ def func(arg): @Substitution(name='ewm') @Appender(_doc_template) - def mean(self, **kwargs): + def mean(self, *args, **kwargs): """exponential weighted moving average""" + nv.validate_window_func('mean', args, kwargs) return self._apply('ewma', **kwargs) @Substitution(name='ewm') @Appender(_doc_template) @Appender(_bias_template) - def std(self, bias=False, **kwargs): + def std(self, bias=False, *args, **kwargs): """exponential weighted moving stddev""" + nv.validate_window_func('std', args, kwargs) return _zsqrt(self.var(bias=bias, **kwargs)) vol = std @@ -1289,8 +1312,9 @@ def std(self, bias=False, **kwargs): @Substitution(name='ewm') @Appender(_doc_template) @Appender(_bias_template) - def var(self, bias=False, **kwargs): + def var(self, bias=False, *args, **kwargs): """exponential weighted moving variance""" + nv.validate_window_func('var', args, kwargs) def f(arg): return algos.ewmcov(arg, arg, self.com, int(self.adjust), diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 571b0fa1ee78f..74048536bd1f3 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -8,6 +8,7 @@ from pandas import date_range, bdate_range, Timestamp from pandas.core.index import Index, MultiIndex, CategoricalIndex from pandas.core.api import Categorical, DataFrame +from pandas.core.common import UnsupportedFunctionCall from pandas.core.groupby import (SpecificationError, DataError, _nargsort, _lexsort_indexer) from pandas.core.series import Series @@ -6393,6 +6394,19 @@ def test_transform_with_non_scalar_group(self): (axis=1, level=1).transform, lambda z: z.div(z.sum(axis=1), axis=0)) + def test_numpy_compat(self): + # see gh-12811 + df = pd.DataFrame({'A': [1, 2, 1], 'B': [1, 2, 3]}) + g = df.groupby('A') + + msg = "numpy operations are not valid with groupby" + + for func 
in ('mean', 'var', 'std', 'cumprod', 'cumsum'): + tm.assertRaisesRegexp(UnsupportedFunctionCall, msg, + getattr(g, func), 1, 2, 3) + tm.assertRaisesRegexp(UnsupportedFunctionCall, msg, + getattr(g, func), foo=1) + def assert_fp_equal(a, b): assert (np.abs(a - b) < 1e-12).all() diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py index a043e92bd2c76..8d9a55bade30d 100644 --- a/pandas/tests/test_window.py +++ b/pandas/tests/test_window.py @@ -20,6 +20,7 @@ import pandas.stats.moments as mom import pandas.core.window as rwindow from pandas.core.base import SpecificationError +from pandas.core.common import UnsupportedFunctionCall import pandas.util.testing as tm from pandas.compat import range, zip, PY3 @@ -296,6 +297,18 @@ def test_constructor(self): with self.assertRaises(ValueError): c(win_type=wt, window=2) + def test_numpy_compat(self): + # see gh-12811 + w = rwindow.Window(Series([2, 4, 6]), window=[0, 2]) + + msg = "numpy operations are not valid with window objects" + + for func in ('sum', 'mean'): + tm.assertRaisesRegexp(UnsupportedFunctionCall, msg, + getattr(w, func), 1, 2, 3) + tm.assertRaisesRegexp(UnsupportedFunctionCall, msg, + getattr(w, func), dtype=np.float64) + class TestRolling(Base): @@ -323,6 +336,18 @@ def test_constructor(self): with self.assertRaises(ValueError): c(window=2, min_periods=1, center=w) + def test_numpy_compat(self): + # see gh-12811 + r = rwindow.Rolling(Series([2, 4, 6]), window=2) + + msg = "numpy operations are not valid with window objects" + + for func in ('std', 'mean', 'sum', 'max', 'min', 'var'): + tm.assertRaisesRegexp(UnsupportedFunctionCall, msg, + getattr(r, func), 1, 2, 3) + tm.assertRaisesRegexp(UnsupportedFunctionCall, msg, + getattr(r, func), dtype=np.float64) + class TestExpanding(Base): @@ -347,6 +372,74 @@ def test_constructor(self): with self.assertRaises(ValueError): c(min_periods=1, center=w) + def test_numpy_compat(self): + # see gh-12811 + e = rwindow.Expanding(Series([2, 4, 6]), window=2) + + msg = "numpy operations are not valid with window objects" + + for func in ('std', 'mean', 'sum', 'max', 'min', 'var'): + tm.assertRaisesRegexp(UnsupportedFunctionCall, msg, + getattr(e, func), 1, 2, 3) + tm.assertRaisesRegexp(UnsupportedFunctionCall, msg, + getattr(e, func), dtype=np.float64) + + +class TestEWM(Base): + + def setUp(self): + self._create_data() + + def test_constructor(self): + for o in [self.series, self.frame]: + c = o.ewm + + # valid + c(com=0.5) + c(span=1.5) + c(alpha=0.5) + c(halflife=0.75) + c(com=0.5, span=None) + c(alpha=0.5, com=None) + c(halflife=0.75, alpha=None) + + # not valid: mutually exclusive + with self.assertRaises(ValueError): + c(com=0.5, alpha=0.5) + with self.assertRaises(ValueError): + c(span=1.5, halflife=0.75) + with self.assertRaises(ValueError): + c(alpha=0.5, span=1.5) + + # not valid: com < 0 + with self.assertRaises(ValueError): + c(com=-0.5) + + # not valid: span < 1 + with self.assertRaises(ValueError): + c(span=0.5) + + # not valid: halflife <= 0 + with self.assertRaises(ValueError): + c(halflife=0) + + # not valid: alpha <= 0 or alpha > 1 + for alpha in (-0.5, 1.5): + with self.assertRaises(ValueError): + c(alpha=alpha) + + def test_numpy_compat(self): + # see gh-12811 + e = rwindow.EWM(Series([2, 4, 6]), alpha=0.5) + + msg = "numpy operations are not valid with window objects" + + for func in ('std', 'mean', 'var'): + tm.assertRaisesRegexp(UnsupportedFunctionCall, msg, + getattr(e, func), 1, 2, 3) + tm.assertRaisesRegexp(UnsupportedFunctionCall, msg, + getattr(e, func), 
dtype=np.float64) + class TestDeprecations(Base): """ test that we are catching deprecation warnings """ diff --git a/pandas/tseries/resample.py b/pandas/tseries/resample.py index bb7915e978c3e..ac30db35c0f85 100644 --- a/pandas/tseries/resample.py +++ b/pandas/tseries/resample.py @@ -16,7 +16,9 @@ from pandas.tseries.period import PeriodIndex, period_range import pandas.core.common as com import pandas.core.algorithms as algos + import pandas.compat as compat +from pandas.compat.numpy import function as nv from pandas.lib import Timestamp import pandas.lib as lib @@ -480,7 +482,7 @@ def asfreq(self): """ return self._upsample('asfreq') - def std(self, ddof=1): + def std(self, ddof=1, *args, **kwargs): """ Compute standard deviation of groups, excluding missing values @@ -489,9 +491,10 @@ def std(self, ddof=1): ddof : integer, default 1 degrees of freedom """ + nv.validate_resampler_func('std', args, kwargs) return self._downsample('std', ddof=ddof) - def var(self, ddof=1): + def var(self, ddof=1, *args, **kwargs): """ Compute variance of groups, excluding missing values @@ -500,6 +503,7 @@ def var(self, ddof=1): ddof : integer, default 1 degrees of freedom """ + nv.validate_resampler_func('var', args, kwargs) return self._downsample('var', ddof=ddof) Resampler._deprecated_valids += dir(Resampler) @@ -507,7 +511,8 @@ def var(self, ddof=1): for method in ['min', 'max', 'first', 'last', 'sum', 'mean', 'sem', 'median', 'prod', 'ohlc']: - def f(self, _method=method): + def f(self, _method=method, *args, **kwargs): + nv.validate_resampler_func(_method, args, kwargs) return self._downsample(_method) f.__doc__ = getattr(GroupBy, method).__doc__ setattr(Resampler, method, f) diff --git a/pandas/tseries/tests/test_resample.py b/pandas/tseries/tests/test_resample.py index dd5ab36d10a45..27b15a412ae37 100644 --- a/pandas/tseries/tests/test_resample.py +++ b/pandas/tseries/tests/test_resample.py @@ -13,7 +13,8 @@ notnull, Timestamp) from pandas.compat import range, lrange, zip, product, OrderedDict from pandas.core.base import SpecificationError -from pandas.core.common import ABCSeries, ABCDataFrame +from pandas.core.common import (ABCSeries, ABCDataFrame, + UnsupportedFunctionCall) from pandas.core.groupby import DataError from pandas.tseries.frequencies import MONTHS, DAYS from pandas.tseries.index import date_range @@ -746,6 +747,22 @@ def _ohlc(group): exc.args += ('how=%s' % arg,) raise + def test_numpy_compat(self): + # see gh-12811 + s = Series([1, 2, 3, 4, 5], index=date_range( + '20130101', periods=5, freq='s')) + r = s.resample('2s') + + msg = "numpy operations are not valid with resample" + + for func in ('min', 'max', 'sum', 'prod', + 'mean', 'var', 'std'): + tm.assertRaisesRegexp(UnsupportedFunctionCall, msg, + getattr(r, func), + func, 1, 2, 3) + tm.assertRaisesRegexp(UnsupportedFunctionCall, msg, + getattr(r, func), axis=1) + def test_resample_how_callables(self): # GH 7929 data = np.arange(5, dtype=np.int64) diff --git a/pandas/tseries/tests/test_tslib.py b/pandas/tseries/tests/test_tslib.py index 4543047a8a72a..79f9c60c9deb7 100644 --- a/pandas/tseries/tests/test_tslib.py +++ b/pandas/tseries/tests/test_tslib.py @@ -1290,6 +1290,34 @@ def test_shift_months(self): years=years, months=months) for x in s]) tm.assert_index_equal(actual, expected) + def test_round(self): + # see gh-12811 + stamp = Timestamp('2000-01-05 05:09:15.13') + + def _check_round(freq, expected): + result = stamp.round(freq=freq) + npResult = np.round(stamp, freq) + self.assertEqual(result, expected) + 
self.assertEqual(npResult, expected) + + for freq, expected in [ + ('D', Timestamp('2000-01-05 00:00:00')), + ('H', Timestamp('2000-01-05 05:00:00')), + ('S', Timestamp('2000-01-05 05:09:15')) + ]: + _check_round(freq, expected) + + msg = "the 'out' parameter is not supported" + tm.assertRaisesRegexp(ValueError, msg, np.round, + stamp, 'D', out=[]) + + # 'freq' is a required parameter, so we cannot + # assign a default should the user accidentally + # assign a 'decimals' input instead + msg = "Could not evaluate" + tm.assertRaisesRegexp(ValueError, msg, np.round, + stamp, 2) + class TestTimestampOps(tm.TestCase): def test_timestamp_and_datetime(self): diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index a240558025090..281a74d640292 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -330,7 +330,7 @@ class Timestamp(_Timestamp): result = result.tz_localize(self.tz) return result - def round(self, freq): + def round(self, freq, *args, **kwargs): """ Round the Timestamp to the specified resolution @@ -346,6 +346,8 @@ class Timestamp(_Timestamp): ------ ValueError if the freq cannot be converted """ + from pandas.compat.numpy.function import validate_round + validate_round(args, kwargs) return self._round(freq, np.round) def floor(self, freq): From 123f2ee16d713c94f41d2a85945c8df0a2244061 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 20 May 2016 10:06:07 -0400 Subject: [PATCH 31/96] BUG: Bug in .to_datetime() when passing integers or floats, no unit and errors=coerce closes #13180 Author: Jeff Reback Closes #13183 from jreback/coerce and squashes the following commits: 0076151 [Jeff Reback] BUG: Bug in .to_datetime() when passing integers or floats, no unit and errors=coerce --- doc/source/whatsnew/v0.18.2.txt | 14 +- pandas/tests/series/test_internals.py | 5 +- pandas/tseries/tests/test_timeseries.py | 188 ++++++++++++++++-------- pandas/tseries/tools.py | 3 +- pandas/tslib.pyx | 47 ++++-- 5 files changed, 176 insertions(+), 81 deletions(-) diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index 3aadfd73895e4..21ed92218e407 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -77,13 +77,26 @@ New Behavior: type(s.tolist()[0]) +.. _whatsnew_0182.api.to_datetime_coerce: +``.to_datetime()`` when coercing +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +A bug is fixed in ``.to_datetime()`` when passing integers or floats, and no ``unit`` and ``errors='coerce'`` (:issue:`13180`). +Previously if ``.to_datetime()`` encountered mixed integers/floats and strings, but no datetimes with ``errors='coerce'`` it would convert all to ``NaT``. +Previous Behavior: +.. code-block:: ipython + In [2]: pd.to_datetime([1, 'foo'], errors='coerce') + Out[2]: DatetimeIndex(['NaT', 'NaT'], dtype='datetime64[ns]', freq=None) +This will now convert integers/floats with the default unit of ``ns``. +.. ipython:: python + + pd.to_datetime([1, 'foo'], errors='coerce') .. 
_whatsnew_0182.api.other:

@@ -139,7 +152,6 @@ Bug Fixes

-
-
 - Bug in ``MultiIndex`` slicing where extra elements were returned when level is non-unique (:issue:`12896`)

diff --git a/pandas/tests/series/test_internals.py b/pandas/tests/series/test_internals.py
index 93bd7f0eec7c5..e3a0e056f4da1 100644
--- a/pandas/tests/series/test_internals.py
+++ b/pandas/tests/series/test_internals.py
@@ -103,7 +103,8 @@ def test_convert_objects(self):
         with tm.assert_produces_warning(FutureWarning):
             result = s.convert_objects(convert_dates='coerce',
                                        convert_numeric=False)
-        assert_series_equal(result, s)
+        expected = Series([lib.NaT] * 2 + [Timestamp(1)] * 2)
+        assert_series_equal(result, expected)

         # preserve if non-object
         s = Series([1], dtype='float32')
@@ -270,7 +271,7 @@ def test_convert(self):

         s = Series(['foo', 'bar', 1, 1.0], dtype='O')
         result = s._convert(datetime=True, coerce=True)
-        expected = Series([lib.NaT] * 4)
+        expected = Series([lib.NaT] * 2 + [Timestamp(1)] * 2)
         assert_series_equal(result, expected)

         # preserve if non-object
diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py
index 3d8e389ba30f2..880713964ec90 100644
--- a/pandas/tseries/tests/test_timeseries.py
+++ b/pandas/tseries/tests/test_timeseries.py
@@ -762,6 +762,15 @@ def test_to_datetime_unit(self):
         with self.assertRaises(ValueError):
             to_datetime([1, 2, 111111111], unit='D')

+        # coerce: these we can process
+        expected = DatetimeIndex([Timestamp('1970-01-02'),
+                                 Timestamp('1970-01-03')] + ['NaT'] * 1)
+        result = to_datetime([1, 2, 'foo'], unit='D', errors='coerce')
+        tm.assert_index_equal(result, expected)
+
+        result = to_datetime([1, 2, 111111111], unit='D', errors='coerce')
+        tm.assert_index_equal(result, expected)
+
     def test_series_ctor_datetime64(self):
         rng = date_range('1/1/2000 00:00:00', '1/1/2000 1:59:50', freq='10s')
         dates = np.asarray(rng)
@@ -2283,6 +2292,123 @@ def test_to_datetime_tz_psycopg2(self):
                                  dtype='datetime64[ns, UTC]')
         tm.assert_index_equal(result, expected)

+    def test_unit(self):
+        # GH 11758
+        # test proper behavior with errors
+
+        with self.assertRaises(ValueError):
+            to_datetime([1], unit='D', format='%Y%m%d')
+
+        values = [11111111, 1, 1.0, tslib.iNaT, pd.NaT, np.nan,
+                  'NaT', '']
+        result = to_datetime(values, unit='D', errors='ignore')
+        expected = Index([11111111, Timestamp('1970-01-02'),
+                          Timestamp('1970-01-02'), pd.NaT,
+                          pd.NaT, pd.NaT, pd.NaT, pd.NaT],
+                         dtype=object)
+        tm.assert_index_equal(result, expected)
+
+        result = to_datetime(values, unit='D', errors='coerce')
+        expected = DatetimeIndex(['NaT', '1970-01-02', '1970-01-02',
+                                  'NaT', 'NaT', 'NaT', 'NaT', 'NaT'])
+        tm.assert_index_equal(result, expected)
+
+        with self.assertRaises(tslib.OutOfBoundsDatetime):
+            to_datetime(values, unit='D', errors='raise')
+
+        values = [1420043460000, tslib.iNaT, pd.NaT, np.nan, 'NaT']
+
+        result = to_datetime(values, errors='ignore', unit='s')
+        expected = Index([1420043460000, pd.NaT, pd.NaT,
+                          pd.NaT, pd.NaT], dtype=object)
+        tm.assert_index_equal(result, expected)
+
+        result = to_datetime(values, errors='coerce', unit='s')
+        expected = DatetimeIndex(['NaT', 'NaT', 'NaT', 'NaT', 'NaT'])
+        tm.assert_index_equal(result, expected)
+
+        with self.assertRaises(tslib.OutOfBoundsDatetime):
+            to_datetime(values, errors='raise', unit='s')
+
+        # if we have a string, then we raise a ValueError
+        # and NOT an OutOfBoundsDatetime
+        for val in ['foo', Timestamp('20130101')]:
+            try:
+                to_datetime(val, errors='raise', unit='s')
+            except tslib.OutOfBoundsDatetime:
+                raise
AssertionError("incorrect exception raised") + except ValueError: + pass + + def test_unit_consistency(self): + + # consistency of conversions + expected = Timestamp('1970-05-09 14:25:11') + result = pd.to_datetime(11111111, unit='s', errors='raise') + self.assertEqual(result, expected) + self.assertIsInstance(result, Timestamp) + + result = pd.to_datetime(11111111, unit='s', errors='coerce') + self.assertEqual(result, expected) + self.assertIsInstance(result, Timestamp) + + result = pd.to_datetime(11111111, unit='s', errors='ignore') + self.assertEqual(result, expected) + self.assertIsInstance(result, Timestamp) + + def test_unit_with_numeric(self): + + # GH 13180 + # coercions from floats/ints are ok + expected = DatetimeIndex(['2015-06-19 05:33:20', + '2015-05-27 22:33:20']) + arr1 = [1.434692e+18, 1.432766e+18] + arr2 = np.array(arr1).astype(int) + for errors in ['ignore', 'raise', 'coerce']: + result = pd.to_datetime(arr1, errors=errors) + tm.assert_index_equal(result, expected) + + result = pd.to_datetime(arr2, errors=errors) + tm.assert_index_equal(result, expected) + + # but we want to make sure that we are coercing + # if we have ints/strings + expected = DatetimeIndex(['NaT', + '2015-06-19 05:33:20', + '2015-05-27 22:33:20']) + arr = ['foo', 1.434692e+18, 1.432766e+18] + result = pd.to_datetime(arr, errors='coerce') + tm.assert_index_equal(result, expected) + + expected = DatetimeIndex(['2015-06-19 05:33:20', + '2015-05-27 22:33:20', + 'NaT', + 'NaT']) + arr = [1.434692e+18, 1.432766e+18, 'foo', 'NaT'] + result = pd.to_datetime(arr, errors='coerce') + tm.assert_index_equal(result, expected) + + def test_unit_mixed(self): + + # mixed integers/datetimes + expected = DatetimeIndex(['2013-01-01', 'NaT', 'NaT']) + arr = [pd.Timestamp('20130101'), 1.434692e+18, 1.432766e+18] + result = pd.to_datetime(arr, errors='coerce') + tm.assert_index_equal(result, expected) + + with self.assertRaises(ValueError): + pd.to_datetime(arr, errors='raise') + + expected = DatetimeIndex(['NaT', + 'NaT', + '2013-01-01']) + arr = [1.434692e+18, 1.432766e+18, pd.Timestamp('20130101')] + result = pd.to_datetime(arr, errors='coerce') + tm.assert_index_equal(result, expected) + + with self.assertRaises(ValueError): + pd.to_datetime(arr, errors='raise') + def test_index_to_datetime(self): idx = Index(['1/1/2000', '1/2/2000', '1/3/2000']) @@ -4229,68 +4355,6 @@ def check(val, unit=None, h=1, s=1, us=0): result = Timestamp('NaT') self.assertIs(result, NaT) - def test_unit_errors(self): - # GH 11758 - # test proper behavior with erros - - with self.assertRaises(ValueError): - to_datetime([1], unit='D', format='%Y%m%d') - - values = [11111111, 1, 1.0, tslib.iNaT, pd.NaT, np.nan, - 'NaT', ''] - result = to_datetime(values, unit='D', errors='ignore') - expected = Index([11111111, Timestamp('1970-01-02'), - Timestamp('1970-01-02'), pd.NaT, - pd.NaT, pd.NaT, pd.NaT, pd.NaT], - dtype=object) - tm.assert_index_equal(result, expected) - - result = to_datetime(values, unit='D', errors='coerce') - expected = DatetimeIndex(['NaT', '1970-01-02', '1970-01-02', - 'NaT', 'NaT', 'NaT', 'NaT', 'NaT']) - tm.assert_index_equal(result, expected) - - with self.assertRaises(tslib.OutOfBoundsDatetime): - to_datetime(values, unit='D', errors='raise') - - values = [1420043460000, tslib.iNaT, pd.NaT, np.nan, 'NaT'] - - result = to_datetime(values, errors='ignore', unit='s') - expected = Index([1420043460000, pd.NaT, pd.NaT, - pd.NaT, pd.NaT], dtype=object) - tm.assert_index_equal(result, expected) - - result = to_datetime(values, 
errors='coerce', unit='s') - expected = DatetimeIndex(['NaT', 'NaT', 'NaT', 'NaT', 'NaT']) - tm.assert_index_equal(result, expected) - - with self.assertRaises(tslib.OutOfBoundsDatetime): - to_datetime(values, errors='raise', unit='s') - - # if we have a string, then we raise a ValueError - # and NOT an OutOfBoundsDatetime - for val in ['foo', Timestamp('20130101')]: - try: - to_datetime(val, errors='raise', unit='s') - except tslib.OutOfBoundsDatetime: - raise AssertionError("incorrect exception raised") - except ValueError: - pass - - # consistency of conversions - expected = Timestamp('1970-05-09 14:25:11') - result = pd.to_datetime(11111111, unit='s', errors='raise') - self.assertEqual(result, expected) - self.assertIsInstance(result, Timestamp) - - result = pd.to_datetime(11111111, unit='s', errors='coerce') - self.assertEqual(result, expected) - self.assertIsInstance(result, Timestamp) - - result = pd.to_datetime(11111111, unit='s', errors='ignore') - self.assertEqual(result, expected) - self.assertIsInstance(result, Timestamp) - def test_roundtrip(self): # test value to string and back conversions diff --git a/pandas/tseries/tools.py b/pandas/tseries/tools.py index a46149035dbae..d5e87d1df2462 100644 --- a/pandas/tseries/tools.py +++ b/pandas/tseries/tools.py @@ -221,7 +221,8 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, - If True, require an exact format match. - If False, allow the format to match anywhere in the target string. - unit : unit of the arg (D,s,ms,us,ns) denote the unit in epoch + unit : string, default 'ns' + unit of the arg (D,s,ms,us,ns) denote the unit in epoch (e.g. a unix timestamp), which is an integer/float number. infer_datetime_format : boolean, default False If True and no `format` is given, attempt to infer the format of the diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index 281a74d640292..d3fd7a807aaef 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -2084,6 +2084,7 @@ cpdef array_with_unit_to_datetime(ndarray values, unit, errors='coerce'): unit)) elif is_ignore: raise AssertionError + iresult[i] = NPY_NAT except: if is_raise: raise OutOfBoundsDatetime("cannot convert input {0}" @@ -2151,7 +2152,7 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', ndarray[int64_t] iresult ndarray[object] oresult pandas_datetimestruct dts - bint utc_convert = bool(utc), seen_integer=0, seen_datetime=0 + bint utc_convert = bool(utc), seen_integer=0, seen_string=0, seen_datetime=0 bint is_raise=errors=='raise', is_ignore=errors=='ignore', is_coerce=errors=='coerce' _TSObject _ts int out_local=0, out_tzoffset=0 @@ -2217,25 +2218,32 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', continue raise - # if we are coercing, dont' allow integers - elif is_integer_object(val) and not is_coerce: - if val == NPY_NAT: + # these must be ns unit by-definition + elif is_integer_object(val) or is_float_object(val): + + if val != val or val == NPY_NAT: iresult[i] = NPY_NAT - else: + elif is_raise or is_ignore: iresult[i] = val seen_integer=1 - elif is_float_object(val) and not is_coerce: - if val != val or val == NPY_NAT: - iresult[i] = NPY_NAT else: - iresult[i] = val - seen_integer=1 + # coerce + # we now need to parse this as if unit='ns' + # we can ONLY accept integers at this point + # if we have previously (or in future accept + # datetimes/strings, then we must coerce) + seen_integer = 1 + try: + iresult[i] = cast_from_unit(val, 'ns') + except: + iresult[i] = NPY_NAT else: try: if len(val) == 0 or val in 
_nat_strings: iresult[i] = NPY_NAT continue + seen_string=1 _string_to_dts(val, &dts, &out_local, &out_tzoffset) value = pandas_datetimestruct_to_datetime(PANDAS_FR_ns, &dts) if out_local == 1: @@ -2278,11 +2286,20 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', continue raise - # don't allow mixed integers and datetime like - # higher levels can catch and is_coerce to object, for - # example - if seen_integer and seen_datetime: - raise ValueError("mixed datetimes and integers in passed array") + if seen_datetime and seen_integer: + # we have mixed datetimes & integers + + if is_coerce: + # coerce all of the integers/floats to NaT, preserve + # the datetimes and other convertibles + for i in range(n): + val = values[i] + if is_integer_object(val) or is_float_object(val): + result[i] = NPY_NAT + elif is_raise: + raise ValueError("mixed datetimes and integers in passed array") + else: + raise TypeError return result except OutOfBoundsDatetime: From cc25040798e016fbcdbba45a927deabedd84ea37 Mon Sep 17 00:00:00 2001 From: adneu Date: Fri, 20 May 2016 10:08:20 -0400 Subject: [PATCH 32/96] =?UTF-8?q?BUG:=20GH12824=20fixed=20apply()=20return?= =?UTF-8?q?s=20different=20result=20depending=20on=20whet=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit closes #12824 Author: adneu Closes #12977 from adneu/GH12824 and squashes the following commits: 9cf7e01 [adneu] BUG: GH12824 fixed apply() returns different result depending on whether first result is None --- doc/source/whatsnew/v0.18.2.txt | 1 + pandas/core/groupby.py | 45 ++++++++++++++++++++------------- pandas/tests/test_groupby.py | 23 +++++++++++++++++ 3 files changed, 51 insertions(+), 18 deletions(-) diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index 21ed92218e407..585522fb469cd 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -167,3 +167,4 @@ Bug Fixes - Bug in ``Period`` addition raises ``TypeError`` if ``Period`` is on right hand side (:issue:`13069`) - Bug in ``Peirod`` and ``Series`` or ``Index`` comparison raises ``TypeError`` (:issue:`13200`) - Bug in ``pd.set_eng_float_format()`` that would prevent NaN's from formatting (:issue:`11981`) +- Bug in ``groupby`` where ``apply`` returns different result depending on whether first result is ``None`` or not (:issue:`12824`) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 2346be5c854f5..bc02d8c49f3ae 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -807,8 +807,9 @@ def reset_identity(values): # reset the identities of the components # of the values to prevent aliasing for v in values: - ax = v._get_axis(self.axis) - ax._reset_identity() + if v is not None: + ax = v._get_axis(self.axis) + ax._reset_identity() return values if not not_indexed_same: @@ -3226,7 +3227,21 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): key_names = self.grouper.names - if isinstance(values[0], DataFrame): + # GH12824. + def first_non_None_value(values): + try: + v = next(v for v in values if v is not None) + except StopIteration: + return None + return v + + v = first_non_None_value(values) + + if v is None: + # GH9684. If all values are None, then this will throw an error. + # We'd prefer it return an empty dataframe. 
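+            # (note: no columns or index from the original frame are
+            #  preserved here; an entirely empty DataFrame comes back)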
+ return DataFrame() + elif isinstance(v, DataFrame): return self._concat_objects(keys, values, not_indexed_same=not_indexed_same) elif self.grouper.groupings is not None: @@ -3253,21 +3268,15 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): key_index = None # make Nones an empty object - if com._count_not_none(*values) != len(values): - try: - v = next(v for v in values if v is not None) - except StopIteration: - # If all values are None, then this will throw an error. - # We'd prefer it return an empty dataframe. - return DataFrame() - if v is None: - return DataFrame() - elif isinstance(v, NDFrame): - values = [ - x if x is not None else - v._constructor(**v._construct_axes_dict()) - for x in values - ] + v = first_non_None_value(values) + if v is None: + return DataFrame() + elif isinstance(v, NDFrame): + values = [ + x if x is not None else + v._constructor(**v._construct_axes_dict()) + for x in values + ] v = values[0] diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 74048536bd1f3..9cb070c0cd926 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -6316,6 +6316,29 @@ def test_func(x): expected = DataFrame() tm.assert_frame_equal(result, expected) + def test_groupby_apply_none_first(self): + # GH 12824. Tests if apply returns None first. + test_df1 = DataFrame({'groups': [1, 1, 1, 2], 'vars': [0, 1, 2, 3]}) + test_df2 = DataFrame({'groups': [1, 2, 2, 2], 'vars': [0, 1, 2, 3]}) + + def test_func(x): + if x.shape[0] < 2: + return None + return x.iloc[[0, -1]] + + result1 = test_df1.groupby('groups').apply(test_func) + result2 = test_df2.groupby('groups').apply(test_func) + index1 = MultiIndex.from_arrays([[1, 1], [0, 2]], + names=['groups', None]) + index2 = MultiIndex.from_arrays([[2, 2], [1, 3]], + names=['groups', None]) + expected1 = DataFrame({'groups': [1, 1], 'vars': [0, 2]}, + index=index1) + expected2 = DataFrame({'groups': [2, 2], 'vars': [1, 3]}, + index=index2) + tm.assert_frame_equal(result1, expected1) + tm.assert_frame_equal(result2, expected2) + def test_first_last_max_min_on_time_data(self): # GH 10295 # Verify that NaT is not in the result of max, min, first and last on From 72164a8471be0e9f41476ae094a3b46479c7a6d2 Mon Sep 17 00:00:00 2001 From: John Freeman Date: Fri, 20 May 2016 10:11:36 -0400 Subject: [PATCH 33/96] API/COMPAT: add pydatetime-style positional args to Timestamp constructor closes #10758 closes #11630 Author: John Freeman Closes #12482 from thejohnfreeman/GH-10758 and squashes the following commits: 79d63d4 [John Freeman] Format comments, test for NaT 0ac786f [John Freeman] Special object for missing first argument a334eab [John Freeman] PEP8 9c1e2dc [John Freeman] review fixes: versionadded, issue links, repr tests, legacy 6fad30b [John Freeman] Fix docstring; add tests for kwargs 5c34c04 [John Freeman] Support positional and keyword arguments for Timestamp 0d6884b [John Freeman] API/COMPAT: add pydatetime-style positional args to Timestamp constructor --- doc/source/timeseries.rst | 1 + doc/source/whatsnew/v0.18.2.txt | 8 ++++ pandas/tseries/tests/test_tslib.py | 46 ++++++++++++++++++++++ pandas/tslib.pyx | 61 +++++++++++++++++++++++++++++- 4 files changed, 114 insertions(+), 2 deletions(-) diff --git a/doc/source/timeseries.rst b/doc/source/timeseries.rst index 114607f117756..62601821488d3 100644 --- a/doc/source/timeseries.rst +++ b/doc/source/timeseries.rst @@ -98,6 +98,7 @@ time. 
pd.Timestamp(datetime(2012, 5, 1))
     pd.Timestamp('2012-05-01')
+    pd.Timestamp(2012, 5, 1)

 However, in many cases it is more natural to associate things like change
 variables with a time span instead. The span represented by ``Period`` can be
diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt
index 585522fb469cd..251fb4c139b04 100644
--- a/doc/source/whatsnew/v0.18.2.txt
+++ b/doc/source/whatsnew/v0.18.2.txt
@@ -39,6 +39,14 @@ Other enhancements
        idx = pd.Index(["a1a2", "b1", "c1"])
        idx.str.extractall("[ab](?P<digit>\d)")

+- ``Timestamp`` objects can now accept positional and keyword parameters like :func:`datetime.datetime` (:issue:`10758`, :issue:`11630`)
+
+  .. ipython:: python
+
+     pd.Timestamp(2012, 1, 1)
+
+     pd.Timestamp(year=2012, month=1, day=1, hour=8, minute=30)
+
 .. _whatsnew_0182.api:

 API changes
diff --git a/pandas/tseries/tests/test_tslib.py b/pandas/tseries/tests/test_tslib.py
index 79f9c60c9deb7..bf546bd9d1a7e 100644
--- a/pandas/tseries/tests/test_tslib.py
+++ b/pandas/tseries/tests/test_tslib.py
@@ -180,6 +180,52 @@ def test_constructor_invalid(self):
         with tm.assertRaisesRegexp(ValueError, 'Cannot convert Period'):
             Timestamp(Period('1000-01-01'))

+    def test_constructor_positional(self):
+        # GH 10758
+        with tm.assertRaises(TypeError):
+            Timestamp(2000, 1)
+        with tm.assertRaises(ValueError):
+            Timestamp(2000, 0, 1)
+        with tm.assertRaises(ValueError):
+            Timestamp(2000, 13, 1)
+        with tm.assertRaises(ValueError):
+            Timestamp(2000, 1, 0)
+        with tm.assertRaises(ValueError):
+            Timestamp(2000, 1, 32)
+
+        # GH 11630
+        self.assertEqual(
+            repr(Timestamp(2015, 11, 12)),
+            repr(Timestamp('20151112')))
+
+        self.assertEqual(
+            repr(Timestamp(2015, 11, 12, 1, 2, 3, 999999)),
+            repr(Timestamp('2015-11-12 01:02:03.999999')))
+
+        self.assertIs(Timestamp(None), pd.NaT)
+
+    def test_constructor_keyword(self):
+        # GH 10758
+        with tm.assertRaises(TypeError):
+            Timestamp(year=2000, month=1)
+        with tm.assertRaises(ValueError):
+            Timestamp(year=2000, month=0, day=1)
+        with tm.assertRaises(ValueError):
+            Timestamp(year=2000, month=13, day=1)
+        with tm.assertRaises(ValueError):
+            Timestamp(year=2000, month=1, day=0)
+        with tm.assertRaises(ValueError):
+            Timestamp(year=2000, month=1, day=32)
+
+        self.assertEqual(
+            repr(Timestamp(year=2015, month=11, day=12)),
+            repr(Timestamp('20151112')))
+
+        self.assertEqual(
+            repr(Timestamp(year=2015, month=11, day=12,
+                           hour=1, minute=2, second=3, microsecond=999999)),
+            repr(Timestamp('2015-11-12 01:02:03.999999')))
+
     def test_conversion(self):
         # GH 9255
         ts = Timestamp('2000-01-01')

diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx
index d3fd7a807aaef..6c6707121c24a 100644
--- a/pandas/tslib.pyx
+++ b/pandas/tslib.pyx
@@ -214,8 +214,8 @@ cdef inline bint _is_fixed_offset(object tz):
             return 0
     return 1

-
 _zero_time = datetime_time(0, 0)
+_no_input = object()

 # Python front end to C extension type _Timestamp
 # This serves as the box for datetime64
@@ -225,6 +225,10 @@ class Timestamp(_Timestamp):
     for the entries that make up a DatetimeIndex, and other timeseries
     oriented data structures in pandas.

+    There are essentially three calling conventions for the constructor. The
+    primary form accepts four parameters. They can be passed by position or
+    keyword.
+
     Parameters
     ----------
     ts_input : datetime-like, str, int, float
@@ -235,6 +239,23 @@ class Timestamp(_Timestamp):
         Time zone for time which Timestamp will have.
     unit : string
         numpy unit used for conversion, if ts_input is int or float
+
+    The other two forms mimic the parameters from ``datetime.datetime``. They
+    can be passed by either position or keyword, but not both mixed together.
+
+    :func:`datetime.datetime` Parameters
+    ------------------------------------
+
+    .. versionadded:: 0.18.2
+
+    year : int
+    month : int
+    day : int
+    hour : int, optional, default is 0
+    minute : int, optional, default is 0
+    second : int, optional, default is 0
+    microsecond : int, optional, default is 0
+    tzinfo : datetime.tzinfo, optional, default is None
     """

     @classmethod
@@ -288,10 +309,46 @@ class Timestamp(_Timestamp):
     def combine(cls, date, time):
         return cls(datetime.combine(date, time))

-    def __new__(cls, object ts_input, object offset=None, tz=None, unit=None):
+    def __new__(cls,
+                object ts_input=_no_input, object offset=None, tz=None, unit=None,
+                year=None, month=None, day=None,
+                hour=None, minute=None, second=None, microsecond=None,
+                tzinfo=None):
+        # The parameter list folds together legacy parameter names (the first
+        # four) and positional and keyword parameter names from pydatetime.
+        #
+        # There are three calling forms:
+        #
+        # - In the legacy form, the first parameter, ts_input, is required
+        #   and may be datetime-like, str, int, or float. The second
+        #   parameter, offset, is optional and may be str or DateOffset.
+        #
+        # - ints in the first, second, and third arguments indicate
+        #   pydatetime positional arguments. Only the first 8 arguments
+        #   (standing in for year, month, day, hour, minute, second,
+        #   microsecond, tzinfo) may be non-None. As a shortcut, we just
+        #   check that the second argument is an int.
+        #
+        # - Nones for the first four (legacy) arguments indicate pydatetime
+        #   keyword arguments. year, month, and day are required. As a
+        #   shortcut, we just check that the first argument was not passed.
+        #
+        # Mixing pydatetime positional and keyword arguments is forbidden!
+
         cdef _TSObject ts
         cdef _Timestamp ts_base

+        if ts_input is _no_input:
+            # User passed keyword arguments.
+            return Timestamp(datetime(year, month, day, hour or 0,
+                                      minute or 0, second or 0,
+                                      microsecond or 0, tzinfo),
+                             tz=tzinfo)
+        elif is_integer_object(offset):
+            # User passed positional arguments:
+            # Timestamp(year, month, day[, hour[, minute[, second[,
+            #           microsecond[, tzinfo]]]]])
+            # Here the legacy names hold the positional values:
+            # ts_input=year, offset=month, tz=day, unit=hour, year=minute,
+            # month=second, day=microsecond, hour=tzinfo.
+            return Timestamp(datetime(ts_input, offset, tz, unit or 0,
+                                      year or 0, month or 0, day or 0,
+                                      hour), tz=hour)
+
         ts = convert_to_tsobject(ts_input, tz, unit, 0, 0)

         if ts.value == NPY_NAT:

From 9d44e637db603417be2716f6c1902527085ec0c5 Mon Sep 17 00:00:00 2001
From: Thomas A Caswell
Date: Tue, 10 May 2016 19:47:02 -0400
Subject: [PATCH 34/96] BUG: fix mpl AutoDateFormatter second/us-second
 formatters

revert part of #11770

xref: https://github.com/matplotlib/matplotlib/issues/6365

closes #13131

The mistake in #11770 was missing that pandas had a 1/us, not a 1/s, scaled
bucket.
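For context, a rough standalone sketch (not part of this patch) of what the
restored buckets do: matplotlib's ``AutoDateFormatter`` keeps a ``scaled``
dict that maps a tick spacing, measured in days, to a strftime format
string. The constant names below simply mirror the ones added to
``pandas/tseries/converter.py``:

    from matplotlib.dates import AutoDateFormatter, AutoDateLocator

    SEC_PER_DAY = 24. * 60. * 60.
    MUSEC_PER_DAY = 1e6 * SEC_PER_DAY

    fmt = AutoDateFormatter(AutoDateLocator())
    # ticks ~1 second apart -> show wall-clock time
    fmt.scaled[1. / SEC_PER_DAY] = '%H:%M:%S'
    # ticks ~1 microsecond apart -> include fractional seconds
    fmt.scaled[1. / MUSEC_PER_DAY] = '%H:%M:%S.%f'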
--- doc/source/whatsnew/v0.18.2.txt | 1 + pandas/tseries/converter.py | 25 +++++++++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index 251fb4c139b04..907ca6f185e0a 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -141,6 +141,7 @@ Bug Fixes - Bug in ``SparseDataFrame`` in which ``axis=None`` did not default to ``axis=0`` (:issue:`13048`) - Bug in ``SparseSeries`` and ``SparseDataFrame`` creation with ``object`` dtype may raise ``TypeError`` (:issue:`11633`) - Bug when passing a not-default-indexed ``Series`` as ``xerr`` or ``yerr`` in ``.plot()`` (:issue:`11858`) +- Bug in matplotlib ``AutoDataFormatter``; this restores the second scaled formatting and re-adds micro-second scaled formatting (:issue:`13131`) - Bug in ``.groupby(..).resample(..)`` when the same object is called multiple times (:issue:`13174`) diff --git a/pandas/tseries/converter.py b/pandas/tseries/converter.py index 8ccfdfa05e9b5..78b185ae8cf31 100644 --- a/pandas/tseries/converter.py +++ b/pandas/tseries/converter.py @@ -23,6 +23,24 @@ from pandas.tseries.frequencies import FreqGroup from pandas.tseries.period import Period, PeriodIndex +# constants +HOURS_PER_DAY = 24. +MIN_PER_HOUR = 60. +SEC_PER_MIN = 60. + +SEC_PER_HOUR = SEC_PER_MIN * MIN_PER_HOUR +SEC_PER_DAY = SEC_PER_HOUR * HOURS_PER_DAY + +MUSEC_PER_DAY = 1e6 * SEC_PER_DAY + + +def _mpl_le_2_0_0(): + try: + import matplotlib + return matplotlib.compare_versions('2.0.0', matplotlib.__version__) + except ImportError: + return False + def register(): units.registry[lib.Timestamp] = DatetimeConverter() @@ -221,6 +239,13 @@ def __init__(self, locator, tz=None, defaultfmt='%Y-%m-%d'): if self._tz is dates.UTC: self._tz._utcoffset = self._tz.utcoffset(None) + # For mpl > 2.0 the format strings are controlled via rcparams + # so do not mess with them. For mpl < 2.0 change the second + # break point and add a musec break point + if _mpl_le_2_0_0(): + self.scaled[1. / SEC_PER_DAY] = '%H:%M:%S' + self.scaled[1. / MUSEC_PER_DAY] = '%H:%M:%S.%f' + class PandasAutoDateLocator(dates.AutoDateLocator): From 8e2f70b749185ebbde4bfc9a09adee1cc2bb47d1 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 20 May 2016 11:52:29 -0400 Subject: [PATCH 35/96] TST: xref #13183, for windows compat --- pandas/tseries/tests/test_timeseries.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index 880713964ec90..46f02c718a09f 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -2363,7 +2363,7 @@ def test_unit_with_numeric(self): expected = DatetimeIndex(['2015-06-19 05:33:20', '2015-05-27 22:33:20']) arr1 = [1.434692e+18, 1.432766e+18] - arr2 = np.array(arr1).astype(int) + arr2 = np.array(arr1).astype('int64') for errors in ['ignore', 'raise', 'coerce']: result = pd.to_datetime(arr1, errors=errors) tm.assert_index_equal(result, expected) From f5c24d2b284b133ca8134c480930617a5918f130 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Sat, 21 May 2016 10:04:15 -0400 Subject: [PATCH 36/96] Reverse numpy compat changes to tslib.pyx Title is self-explanatory. xref #13148. 
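To make the intent concrete, a small sketch of the behaviour this revert
keeps (expected values taken from the reverted test): ``Timestamp.round``
goes back to accepting only a ``freq`` argument, with no numpy-style
positional or keyword extras.

    import pandas as pd

    stamp = pd.Timestamp('2000-01-05 05:09:15.13')
    stamp.round('D')  # Timestamp('2000-01-05 00:00:00')
    stamp.round('H')  # Timestamp('2000-01-05 05:00:00')
    stamp.round('S')  # Timestamp('2000-01-05 05:09:15')
    # stamp.round('foo') raises ValueError ("Could not evaluate")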
Author: gfyoung Closes #13246 from gfyoung/tslib-compat-undo and squashes the following commits: 66160f9 [gfyoung] Reverse numpy compat changes to tslib.pyx --- doc/source/whatsnew/v0.18.2.txt | 1 - pandas/tseries/tests/test_tslib.py | 14 ++------------ pandas/tslib.pyx | 4 +--- 3 files changed, 3 insertions(+), 16 deletions(-) diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index 907ca6f185e0a..5b4a4981ab8ad 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -54,7 +54,6 @@ API changes - Non-convertible dates in an excel date column will be returned without conversion and the column will be ``object`` dtype, rather than raising an exception (:issue:`10001`) -- Compat with ``np.round`` and timestamps (:issue:`12811`) - An ``UnsupportedFunctionCall`` error is now raised if numpy ufuncs like ``np.mean`` are called on groupby or resample objects (:issue:`12811`) .. _whatsnew_0182.api.tolist: diff --git a/pandas/tseries/tests/test_tslib.py b/pandas/tseries/tests/test_tslib.py index bf546bd9d1a7e..8414a5ed42991 100644 --- a/pandas/tseries/tests/test_tslib.py +++ b/pandas/tseries/tests/test_tslib.py @@ -1337,14 +1337,11 @@ def test_shift_months(self): tm.assert_index_equal(actual, expected) def test_round(self): - # see gh-12811 stamp = Timestamp('2000-01-05 05:09:15.13') def _check_round(freq, expected): result = stamp.round(freq=freq) - npResult = np.round(stamp, freq) self.assertEqual(result, expected) - self.assertEqual(npResult, expected) for freq, expected in [ ('D', Timestamp('2000-01-05 00:00:00')), @@ -1353,16 +1350,9 @@ def _check_round(freq, expected): ]: _check_round(freq, expected) - msg = "the 'out' parameter is not supported" - tm.assertRaisesRegexp(ValueError, msg, np.round, - stamp, 'D', out=[]) - - # 'freq' is a required parameter, so we cannot - # assign a default should the user accidentally - # assign a 'decimals' input instead msg = "Could not evaluate" - tm.assertRaisesRegexp(ValueError, msg, np.round, - stamp, 2) + tm.assertRaisesRegexp(ValueError, msg, + stamp.round, 'foo') class TestTimestampOps(tm.TestCase): diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index 6c6707121c24a..f5301d3746e8b 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -387,7 +387,7 @@ class Timestamp(_Timestamp): result = result.tz_localize(self.tz) return result - def round(self, freq, *args, **kwargs): + def round(self, freq): """ Round the Timestamp to the specified resolution @@ -403,8 +403,6 @@ class Timestamp(_Timestamp): ------ ValueError if the freq cannot be converted """ - from pandas.compat.numpy.function import validate_round - validate_round(args, kwargs) return self._round(freq, np.round) def floor(self, freq): From d2b581960168502b1f7dfd73cedfe03ffbf91aee Mon Sep 17 00:00:00 2001 From: Maximilian Roos Date: Sat, 21 May 2016 10:08:27 -0400 Subject: [PATCH 37/96] BUG: Empty PeriodIndex issues closes #13067 closes #13212 Author: Maximilian Roos Closes #13079 from MaximilianR/period_resample_0 and squashes the following commits: 8c7b9db [Maximilian Roos] empty PeriodIndex issues --- doc/source/whatsnew/v0.18.2.txt | 6 ++ pandas/core/groupby.py | 39 ++++---- pandas/tests/indexing/test_indexing.py | 3 +- pandas/tests/test_groupby.py | 6 +- pandas/tseries/period.py | 17 ++-- pandas/tseries/resample.py | 20 ++-- pandas/tseries/tests/test_period.py | 34 ++++++- pandas/tseries/tests/test_resample.py | 132 +++++++++++++++---------- 8 files changed, 168 insertions(+), 89 deletions(-) diff --git 
a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt
index 5b4a4981ab8ad..111db67466c05 100644
--- a/doc/source/whatsnew/v0.18.2.txt
+++ b/doc/source/whatsnew/v0.18.2.txt
@@ -156,6 +156,9 @@ Bug Fixes

 - Bug in ``PeriodIndex`` and ``Period`` subtraction raises ``AttributeError`` (:issue:`13071`)
+- Bug in ``.resample(..)`` with a ``PeriodIndex`` not changing its ``freq`` appropriately when empty (:issue:`13067`)
+- Bug in ``PeriodIndex`` construction returning a ``float64`` index in some circumstances (:issue:`13067`)
+- Bug in ``.resample(..)`` with a ``PeriodIndex`` not retaining its type or name appropriately when the ``DataFrame`` is empty (:issue:`13212`)



@@ -175,4 +178,7 @@ Bug Fixes
 - Bug in ``Period`` addition raises ``TypeError`` if ``Period`` is on right hand side (:issue:`13069`)
 - Bug in ``Period`` and ``Series`` or ``Index`` comparison raises ``TypeError`` (:issue:`13200`)
 - Bug in ``pd.set_eng_float_format()`` that would prevent NaN's from formatting (:issue:`11981`)
+
+
+
 - Bug in ``groupby`` where ``apply`` returns different result depending on whether first result is ``None`` or not (:issue:`12824`)
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
index bc02d8c49f3ae..bea62e98e4a2a 100644
--- a/pandas/core/groupby.py
+++ b/pandas/core/groupby.py
@@ -37,7 +37,7 @@
     is_datetime_or_timedelta_dtype, is_bool,
     is_bool_dtype, AbstractMethodError, _maybe_fill)

-from pandas.core.config import option_context
+from pandas.core.config import option_context, is_callable
 import pandas.lib as lib
 from pandas.lib import Timestamp
 import pandas.tslib as tslib
@@ -643,9 +643,20 @@ def apply(self, func, *args, **kwargs):

         func = self._is_builtin_func(func)

-        @wraps(func)
-        def f(g):
-            return func(g, *args, **kwargs)
+        # this is needed so we don't try and wrap strings.
If we could + # resolve functions to their callable functions prior, this + # wouldn't be needed + if args or kwargs: + if is_callable(func): + + @wraps(func) + def f(g): + return func(g, *args, **kwargs) + else: + raise ValueError('func must be a callable if args or ' + 'kwargs are supplied') + else: + f = func # ignore SettingWithCopy here in case the user mutates with option_context('mode.chained_assignment', None): @@ -2675,7 +2686,7 @@ def _wrap_transformed_output(self, output, names=None): def _wrap_applied_output(self, keys, values, not_indexed_same=False): if len(keys) == 0: # GH #6265 - return Series([], name=self.name) + return Series([], name=self.name, index=keys) def _get_index(): if self.grouper.nkeys > 1: @@ -3222,8 +3233,7 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): from pandas.core.index import _all_indexes_same if len(keys) == 0: - # XXX - return DataFrame({}) + return DataFrame(index=keys) key_names = self.grouper.names @@ -3646,17 +3656,12 @@ def _gotitem(self, key, ndim, subset=None): def _wrap_generic_output(self, result, obj): result_index = self.grouper.levels[0] - if result: - if self.axis == 0: - result = DataFrame(result, index=obj.columns, - columns=result_index).T - else: - result = DataFrame(result, index=obj.index, - columns=result_index) + if self.axis == 0: + return DataFrame(result, index=obj.columns, + columns=result_index).T else: - result = DataFrame(result) - - return result + return DataFrame(result, index=obj.index, + columns=result_index) def _get_data_to_aggregate(self): obj = self._obj_with_exclusions diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index fdc9d3599e8ac..708006a9dc21b 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -4250,7 +4250,8 @@ def test_series_partial_set_period(self): pd.Period('2011-01-03', freq='D')] exp = Series([np.nan, 0.2, np.nan], index=pd.PeriodIndex(keys, name='idx'), name='s') - assert_series_equal(ser.loc[keys], exp, check_index_type=True) + result = ser.loc[keys] + assert_series_equal(result, exp) def test_partial_set_invalid(self): diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 9cb070c0cd926..5dfe88d04309e 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -775,11 +775,11 @@ def test_agg_apply_corner(self): # DataFrame grouped = self.tsframe.groupby(self.tsframe['A'] * np.nan) exp_df = DataFrame(columns=self.tsframe.columns, dtype=float, - index=pd.Index( - [], dtype=np.float64)) + index=pd.Index([], dtype=np.float64)) assert_frame_equal(grouped.sum(), exp_df, check_names=False) assert_frame_equal(grouped.agg(np.sum), exp_df, check_names=False) - assert_frame_equal(grouped.apply(np.sum), DataFrame({}, dtype=float)) + assert_frame_equal(grouped.apply(np.sum), exp_df.iloc[:, :0], + check_names=False) def test_agg_grouping_is_list_tuple(self): from pandas.core.groupby import Grouping diff --git a/pandas/tseries/period.py b/pandas/tseries/period.py index fb91185746181..b690bc23c2496 100644 --- a/pandas/tseries/period.py +++ b/pandas/tseries/period.py @@ -17,9 +17,9 @@ from pandas.core.base import _shared_docs import pandas.core.common as com -from pandas.core.common import (isnull, _INT64_DTYPE, _maybe_box, - _values_from_object, ABCSeries, - is_integer, is_float, is_object_dtype) +from pandas.core.common import ( + isnull, _INT64_DTYPE, _maybe_box, _values_from_object, ABCSeries, + is_integer, is_float) from pandas import compat 
from pandas.compat.numpy import function as nv from pandas.util.decorators import Appender, cache_readonly, Substitution @@ -271,10 +271,15 @@ def _from_arraylike(cls, data, freq, tz): @classmethod def _simple_new(cls, values, name=None, freq=None, **kwargs): - if not getattr(values, 'dtype', None): + + if not com.is_integer_dtype(values): values = np.array(values, copy=False) - if is_object_dtype(values): - return PeriodIndex(values, name=name, freq=freq, **kwargs) + if (len(values) > 0 and com.is_float_dtype(values)): + raise TypeError("PeriodIndex can't take floats") + else: + return PeriodIndex(values, name=name, freq=freq, **kwargs) + + values = np.array(values, dtype='int64', copy=False) result = object.__new__(cls) result._data = values diff --git a/pandas/tseries/resample.py b/pandas/tseries/resample.py index ac30db35c0f85..90ec5d19db590 100644 --- a/pandas/tseries/resample.py +++ b/pandas/tseries/resample.py @@ -21,6 +21,7 @@ from pandas.compat.numpy import function as nv from pandas.lib import Timestamp +from pandas._period import IncompatibleFrequency import pandas.lib as lib import pandas.tslib as tslib @@ -795,16 +796,17 @@ def _downsample(self, how, **kwargs): ax = self.ax new_index = self._get_new_index() - if len(new_index) == 0: - return self._wrap_result(self._selected_obj.reindex(new_index)) # Start vs. end of period memb = ax.asfreq(self.freq, how=self.convention) if is_subperiod(ax.freq, self.freq): # Downsampling - rng = np.arange(memb.values[0], memb.values[-1] + 1) - bins = memb.searchsorted(rng, side='right') + if len(new_index) == 0: + bins = [] + else: + rng = np.arange(memb.values[0], memb.values[-1] + 1) + bins = memb.searchsorted(rng, side='right') grouper = BinGrouper(bins, new_index) return self._groupby_and_aggregate(how, grouper=grouper) elif is_superperiod(ax.freq, self.freq): @@ -812,10 +814,9 @@ def _downsample(self, how, **kwargs): elif ax.freq == self.freq: return self.asfreq() - raise ValueError('Frequency {axfreq} cannot be ' - 'resampled to {freq}'.format( - axfreq=ax.freq, - freq=self.freq)) + raise IncompatibleFrequency( + 'Frequency {} cannot be resampled to {}, as they are not ' + 'sub or super periods'.format(ax.freq, self.freq)) def _upsample(self, method, limit=None): """ @@ -838,9 +839,6 @@ def _upsample(self, method, limit=None): obj = self.obj new_index = self._get_new_index() - if len(new_index) == 0: - return self._wrap_result(self._selected_obj.reindex(new_index)) - # Start vs. 
end of period memb = ax.asfreq(self.freq, how=self.convention) diff --git a/pandas/tseries/tests/test_period.py b/pandas/tseries/tests/test_period.py index db1572a49a9ff..c5aae1f8ecebb 100644 --- a/pandas/tseries/tests/test_period.py +++ b/pandas/tseries/tests/test_period.py @@ -1742,13 +1742,45 @@ def test_constructor_datetime64arr(self): self.assertRaises(ValueError, PeriodIndex, vals, freq='D') def test_constructor_simple_new(self): - idx = period_range('2007-01', name='p', periods=20, freq='M') + idx = period_range('2007-01', name='p', periods=2, freq='M') result = idx._simple_new(idx, 'p', freq=idx.freq) self.assertTrue(result.equals(idx)) result = idx._simple_new(idx.astype('i8'), 'p', freq=idx.freq) self.assertTrue(result.equals(idx)) + result = idx._simple_new( + [pd.Period('2007-01', freq='M'), pd.Period('2007-02', freq='M')], + 'p', freq=idx.freq) + self.assertTrue(result.equals(idx)) + + result = idx._simple_new( + np.array([pd.Period('2007-01', freq='M'), + pd.Period('2007-02', freq='M')]), + 'p', freq=idx.freq) + self.assertTrue(result.equals(idx)) + + def test_constructor_simple_new_empty(self): + # GH13079 + idx = PeriodIndex([], freq='M', name='p') + result = idx._simple_new(idx, name='p', freq='M') + assert_index_equal(result, idx) + + def test_constructor_simple_new_floats(self): + # GH13079 + for floats in [[1.1], np.array([1.1])]: + with self.assertRaises(TypeError): + pd.PeriodIndex._simple_new(floats, freq='M') + + def test_shallow_copy_empty(self): + + # GH13067 + idx = PeriodIndex([], freq='M') + result = idx._shallow_copy() + expected = idx + + assert_index_equal(result, expected) + def test_constructor_nat(self): self.assertRaises(ValueError, period_range, start='NaT', end='2011-01-01', freq='M') diff --git a/pandas/tseries/tests/test_resample.py b/pandas/tseries/tests/test_resample.py index 27b15a412ae37..37b16684643be 100644 --- a/pandas/tseries/tests/test_resample.py +++ b/pandas/tseries/tests/test_resample.py @@ -17,15 +17,16 @@ UnsupportedFunctionCall) from pandas.core.groupby import DataError from pandas.tseries.frequencies import MONTHS, DAYS +from pandas.tseries.frequencies import to_offset from pandas.tseries.index import date_range from pandas.tseries.offsets import Minute, BDay from pandas.tseries.period import period_range, PeriodIndex, Period from pandas.tseries.resample import (DatetimeIndex, TimeGrouper, DatetimeIndexResampler) -from pandas.tseries.frequencies import to_offset from pandas.tseries.tdi import timedelta_range from pandas.util.testing import (assert_series_equal, assert_almost_equal, - assert_frame_equal) + assert_frame_equal, assert_index_equal) +from pandas._period import IncompatibleFrequency bday = BDay() @@ -578,6 +579,7 @@ class Base(object): base class for resampling testing, calling .create_series() generates a series of each index type """ + def create_index(self, *args, **kwargs): """ return the _index_factory created using the args, kwargs """ factory = self._index_factory() @@ -620,6 +622,76 @@ def test_resample_interpolate(self): df.resample('1T').asfreq().interpolate(), df.resample('1T').interpolate()) + def test_raises_on_non_datetimelike_index(self): + # this is a non datetimelike index + xp = DataFrame() + self.assertRaises(TypeError, lambda: xp.resample('A').mean()) + + def test_resample_empty_series(self): + # GH12771 & GH12868 + + s = self.create_series()[:0] + + for freq in ['M', 'D', 'H']: + # need to test for ohlc from GH13083 + methods = [method for method in resample_methods + if method != 'ohlc'] + for method 
in methods: + expected_index = s.index._shallow_copy(freq=freq) + + result = getattr(s.resample(freq), method)() + expected = s + assert_index_equal(result.index, expected_index) + # freq equality not yet checked in assert_index_equal + self.assertEqual(result.index.freq, expected_index.freq) + if (method == 'size' and + isinstance(result.index, PeriodIndex) and + freq in ['M', 'D']): + # GH12871 - TODO: name should propagate, but currently + # doesn't on lower / same frequency with PeriodIndex + assert_series_equal(result, expected, check_dtype=False, + check_names=False) + # this assert will break when fixed + self.assertTrue(result.name is None) + else: + assert_series_equal(result, expected, check_dtype=False) + + def test_resample_empty_dataframe(self): + # GH13212 + index = self.create_series().index[:0] + f = DataFrame(index=index) + + for freq in ['M', 'D', 'H']: + # count retains dimensions too + methods = downsample_methods + ['count'] + for method in methods: + expected_index = f.index._shallow_copy(freq=freq) + result = getattr(f.resample(freq), method)() + expected = f + assert_index_equal(result.index, expected_index) + # freq equality not yet checked in assert_index_equal + # TODO: remove when freq checked + self.assertEqual(result.index.freq, expected_index.freq) + assert_frame_equal(result, expected, check_dtype=False) + + # test size for GH13212 (currently stays as df) + + def test_resample_empty_dtypes(self): + + # Empty series were sometimes causing a segfault (for the functions + # with Cython bounds-checking disabled) or an IndexError. We just run + # them to ensure they no longer do. (GH #10228) + for index in tm.all_timeseries_index_generator(0): + for dtype in (np.float, np.int, np.object, 'datetime64[ns]'): + for how in downsample_methods + upsample_methods: + empty_series = pd.Series([], index, dtype) + try: + getattr(empty_series.resample('d'), how)() + except DataError: + # Ignore these since some combinations are invalid + # (ex: doing mean with dtype of np.object) + pass + class TestDatetimeIndex(Base, tm.TestCase): _multiprocess_can_split_ = True @@ -1408,39 +1480,6 @@ def test_period_with_agg(self): result = s2.resample('D').agg(lambda x: x.mean()) assert_series_equal(result, expected) - def test_resample_empty(self): - ts = _simple_ts('1/1/2000', '2/1/2000')[:0] - - result = ts.resample('A').mean() - self.assertEqual(len(result), 0) - self.assertEqual(result.index.freqstr, 'A-DEC') - - result = ts.resample('A', kind='period').mean() - self.assertEqual(len(result), 0) - self.assertEqual(result.index.freqstr, 'A-DEC') - - # this is a non datetimelike index - xp = DataFrame() - self.assertRaises(TypeError, lambda: xp.resample('A').mean()) - - # Empty series were sometimes causing a segfault (for the functions - # with Cython bounds-checking disabled) or an IndexError. We just run - # them to ensure they no longer do. 
(GH #10228) - for index in tm.all_timeseries_index_generator(0): - for dtype in (np.float, np.int, np.object, 'datetime64[ns]'): - for how in downsample_methods + upsample_methods: - empty_series = pd.Series([], index, dtype) - try: - getattr(empty_series.resample('d'), how)() - except DataError: - # Ignore these since some combinations are invalid - # (ex: doing mean with dtype of np.object) - pass - - # this should also tests nunique - # (IOW, use resample_methods) - # when GH12886 is closed - def test_resample_segfault(self): # GH 8573 # segfaulting in older versions @@ -2085,19 +2124,6 @@ def test_resample_basic(self): result2 = s.resample('T', kind='period').mean() assert_series_equal(result2, expected) - def test_resample_empty(self): - - # GH12771 & GH12868 - index = PeriodIndex(start='2000', periods=0, freq='D', name='idx') - s = Series(index=index) - - expected_index = PeriodIndex([], name='idx', freq='M') - expected = Series(index=expected_index) - - for method in resample_methods: - result = getattr(s.resample('M'), method)() - assert_series_equal(result, expected) - def test_resample_count(self): # GH12774 @@ -2121,6 +2147,12 @@ def test_resample_same_freq(self): result = getattr(series.resample('M'), method)() assert_series_equal(result, expected) + def test_resample_incompat_freq(self): + + with self.assertRaises(IncompatibleFrequency): + pd.Series(range(3), index=pd.period_range( + start='2000', periods=3, freq='M')).resample('W').mean() + def test_with_local_timezone_pytz(self): # GH5430 tm._skip_if_no_pytz() @@ -2482,7 +2514,6 @@ def create_series(self): return Series(np.arange(len(i)), index=i, name='tdi') def test_asfreq_bug(self): - import datetime as dt df = DataFrame(data=[1, 3], index=[dt.timedelta(), dt.timedelta(minutes=3)]) @@ -2495,7 +2526,6 @@ def test_asfreq_bug(self): class TestResamplerGrouper(tm.TestCase): - def setUp(self): self.frame = DataFrame({'A': [1] * 20 + [2] * 12 + [3] * 8, 'B': np.arange(40)}, @@ -2631,11 +2661,13 @@ def test_apply(self): def f(x): return x.resample('2s').sum() + result = r.apply(f) assert_frame_equal(result, expected) def f(x): return x.resample('2s').apply(lambda y: y.sum()) + result = g.apply(f) assert_frame_equal(result, expected) From 6f90340df32711dea72f5701a5af8cd141393874 Mon Sep 17 00:00:00 2001 From: Allen Riddell Date: Sat, 21 May 2016 10:33:40 -0400 Subject: [PATCH 38/96] API: Use np.random's RandomState when seed is None in .sample closes #13143 Author: Allen Riddell Closes #13161 from ariddell/feature/sample-numpy-random-seed and squashes the following commits: 595b0bc [Allen Riddell] Use np.random's RandomState when seed is None --- doc/source/whatsnew/v0.18.2.txt | 2 ++ pandas/core/common.py | 4 ++-- pandas/tests/test_common.py | 2 +- pandas/tests/test_generic.py | 8 ++++++++ 4 files changed, 13 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index 111db67466c05..a77bdcec2ce7a 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -55,6 +55,8 @@ API changes - Non-convertible dates in an excel date column will be returned without conversion and the column will be ``object`` dtype, rather than raising an exception (:issue:`10001`) - An ``UnsupportedFunctionCall`` error is now raised if numpy ufuncs like ``np.mean`` are called on groupby or resample objects (:issue:`12811`) +- Calls to ``.sample()`` will respect the random seed set via ``numpy.random.seed(n)`` (:issue:`13161`) + .. 
_whatsnew_0182.api.tolist: diff --git a/pandas/core/common.py b/pandas/core/common.py index 64bfbdde0c5c3..8af6b78a050f3 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -2062,7 +2062,7 @@ def _random_state(state=None): state : int, np.random.RandomState, None. If receives an int, passes to np.random.RandomState() as seed. If receives an np.random.RandomState object, just returns object. - If receives `None`, returns an np.random.RandomState object. + If receives `None`, returns np.random. If receives anything else, raises an informative ValueError. Default None. @@ -2076,7 +2076,7 @@ def _random_state(state=None): elif isinstance(state, np.random.RandomState): return state elif state is None: - return np.random.RandomState() + return np.random else: raise ValueError("random_state must be an integer, a numpy " "RandomState, or None") diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index 090669681fb4f..e1b186f6c21e5 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -695,7 +695,7 @@ def test_random_state(): com._random_state(state2).uniform(), npr.RandomState(10).uniform()) # check with no arg random state - assert isinstance(com._random_state(), npr.RandomState) + assert com._random_state() is np.random # Error for floats or strings with tm.assertRaises(ValueError): diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py index ba282f0107d71..2bad2fabcfc57 100644 --- a/pandas/tests/test_generic.py +++ b/pandas/tests/test_generic.py @@ -415,6 +415,14 @@ def test_sample(self): o.sample(frac=0.7, random_state=np.random.RandomState(test)), o.sample(frac=0.7, random_state=np.random.RandomState(test))) + os1, os2 = [], [] + for _ in range(2): + np.random.seed(test) + os1.append(o.sample(n=4)) + os2.append(o.sample(frac=0.7)) + self._compare(*os1) + self._compare(*os2) + # Check for error when random_state argument invalid. 
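# A minimal sketch of the _random_state contract this test leans on
# (pandas/core/common.py as patched above; `com` is the alias used for
# pandas.core.common in pandas/tests/test_common.py):
#
#   com._random_state(10)    # np.random.RandomState seeded with 10
#   com._random_state(None)  # the global np.random module itself
#   com._random_state(1.5)   # ValueError: floats/strings are invalid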
with tm.assertRaises(ValueError): o.sample(random_state='astring!') From 82bdc1dc80330330ab34e8a3a8c1e37b7e3a9b43 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Sat, 21 May 2016 19:56:42 -0400 Subject: [PATCH 39/96] TST: check internal Categorical - [x] closes #13076 - [x] tests added / passed - [x] passes ``git diff upstream/master | flake8 --diff`` Author: sinhrks Closes #13249 from sinhrks/test_categorical and squashes the following commits: f536644 [sinhrks] TST: check internal Categorical --- doc/source/whatsnew/v0.18.2.txt | 2 +- pandas/core/reshape.py | 9 +- pandas/io/tests/test_pickle.py | 17 +++ pandas/io/tests/test_pytables.py | 2 +- pandas/io/tests/test_stata.py | 33 ++--- pandas/tests/frame/test_reshape.py | 39 +++--- pandas/tests/indexing/test_categorical.py | 25 ++-- pandas/tests/series/test_apply.py | 3 +- pandas/tests/test_categorical.py | 143 ++++++++++++---------- pandas/tests/test_generic.py | 13 +- pandas/tests/test_groupby.py | 32 +++-- pandas/tests/test_reshape.py | 22 +--- pandas/util/testing.py | 26 +++- 13 files changed, 221 insertions(+), 145 deletions(-) diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index a77bdcec2ce7a..4b3c96da10efd 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -180,7 +180,7 @@ Bug Fixes - Bug in ``Period`` addition raises ``TypeError`` if ``Period`` is on right hand side (:issue:`13069`) - Bug in ``Peirod`` and ``Series`` or ``Index`` comparison raises ``TypeError`` (:issue:`13200`) - Bug in ``pd.set_eng_float_format()`` that would prevent NaN's from formatting (:issue:`11981`) - +- Bug in ``.unstack`` with ``Categorical`` dtype resets ``.ordered`` to ``True`` (:issue:`13249`) - Bug in ``groupby`` where ``apply`` returns different result depending on whether first result is ``None`` or not (:issue:`12824`) diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index 7e0c094aec4c2..8d237016d1b33 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -162,9 +162,12 @@ def get_result(self): # may need to coerce categoricals here if self.is_categorical is not None: - values = [Categorical.from_array( - values[:, i], categories=self.is_categorical.categories, - ordered=True) for i in range(values.shape[-1])] + categories = self.is_categorical.categories + ordered = self.is_categorical.ordered + values = [Categorical.from_array(values[:, i], + categories=categories, + ordered=ordered) + for i in range(values.shape[-1])] return DataFrame(values, index=index, columns=columns) diff --git a/pandas/io/tests/test_pickle.py b/pandas/io/tests/test_pickle.py index 4ff0363d07df6..7f2813d5281cb 100644 --- a/pandas/io/tests/test_pickle.py +++ b/pandas/io/tests/test_pickle.py @@ -108,6 +108,13 @@ def compare_series_dt_tz(self, result, expected, typ, version): else: tm.assert_series_equal(result, expected) + def compare_series_cat(self, result, expected, typ, version): + # Categorical.ordered is changed in < 0.16.0 + if LooseVersion(version) < '0.16.0': + tm.assert_series_equal(result, expected, check_categorical=False) + else: + tm.assert_series_equal(result, expected) + def compare_frame_dt_mixed_tzs(self, result, expected, typ, version): # 8260 # dtype is object < 0.17.0 @@ -117,6 +124,16 @@ def compare_frame_dt_mixed_tzs(self, result, expected, typ, version): else: tm.assert_frame_equal(result, expected) + def compare_frame_cat_onecol(self, result, expected, typ, version): + # Categorical.ordered is changed in < 0.16.0 + if LooseVersion(version) < '0.16.0': + 
tm.assert_frame_equal(result, expected, check_categorical=False) + else: + tm.assert_frame_equal(result, expected) + + def compare_frame_cat_and_float(self, result, expected, typ, version): + self.compare_frame_cat_onecol(result, expected, typ, version) + def compare_index_period(self, result, expected, typ, version): tm.assert_index_equal(result, expected) tm.assertIsInstance(result.freq, MonthEnd) diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 6bf0175526424..5ee84ce97979a 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -1004,7 +1004,7 @@ def roundtrip(s, key='data', encoding='latin-1', nan_rep=''): nan_rep=nan_rep) retr = read_hdf(store, key) s_nan = s.replace(nan_rep, np.nan) - assert_series_equal(s_nan, retr) + assert_series_equal(s_nan, retr, check_categorical=False) for s in examples: roundtrip(s) diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index fe782bb86d1be..17f74d5789298 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -234,10 +234,11 @@ def test_read_dta4(self): expected = pd.concat([expected[col].astype('category') for col in expected], axis=1) - tm.assert_frame_equal(parsed_113, expected) - tm.assert_frame_equal(parsed_114, expected) - tm.assert_frame_equal(parsed_115, expected) - tm.assert_frame_equal(parsed_117, expected) + # stata doesn't save .category metadata + tm.assert_frame_equal(parsed_113, expected, check_categorical=False) + tm.assert_frame_equal(parsed_114, expected, check_categorical=False) + tm.assert_frame_equal(parsed_115, expected, check_categorical=False) + tm.assert_frame_equal(parsed_117, expected, check_categorical=False) # File containing strls def test_read_dta12(self): @@ -872,8 +873,8 @@ def test_categorical_writing(self): # Silence warnings original.to_stata(path) written_and_read_again = self.read_dta(path) - tm.assert_frame_equal( - written_and_read_again.set_index('index'), expected) + res = written_and_read_again.set_index('index') + tm.assert_frame_equal(res, expected, check_categorical=False) def test_categorical_warnings_and_errors(self): # Warning for non-string labels @@ -915,8 +916,8 @@ def test_categorical_with_stata_missing_values(self): with tm.ensure_clean() as path: original.to_stata(path) written_and_read_again = self.read_dta(path) - tm.assert_frame_equal( - written_and_read_again.set_index('index'), original) + res = written_and_read_again.set_index('index') + tm.assert_frame_equal(res, original, check_categorical=False) def test_categorical_order(self): # Directly construct using expected codes @@ -945,8 +946,8 @@ def test_categorical_order(self): # Read with and with out categoricals, ensure order is identical parsed_115 = read_stata(self.dta19_115) parsed_117 = read_stata(self.dta19_117) - tm.assert_frame_equal(expected, parsed_115) - tm.assert_frame_equal(expected, parsed_117) + tm.assert_frame_equal(expected, parsed_115, check_categorical=False) + tm.assert_frame_equal(expected, parsed_117, check_categorical=False) # Check identity of codes for col in expected: @@ -969,8 +970,10 @@ def test_categorical_sorting(self): categories = ["Poor", "Fair", "Good", "Very good", "Excellent"] cat = pd.Categorical.from_codes(codes=codes, categories=categories) expected = pd.Series(cat, name='srh') - tm.assert_series_equal(expected, parsed_115["srh"]) - tm.assert_series_equal(expected, parsed_117["srh"]) + tm.assert_series_equal(expected, parsed_115["srh"], + check_categorical=False) + 
tm.assert_series_equal(expected, parsed_117["srh"], + check_categorical=False) def test_categorical_ordering(self): parsed_115 = read_stata(self.dta19_115) @@ -1021,7 +1024,8 @@ def test_read_chunks_117(self): from_frame = parsed.iloc[pos:pos + chunksize, :] tm.assert_frame_equal( from_frame, chunk, check_dtype=False, - check_datetimelike_compat=True) + check_datetimelike_compat=True, + check_categorical=False) pos += chunksize itr.close() @@ -1087,7 +1091,8 @@ def test_read_chunks_115(self): from_frame = parsed.iloc[pos:pos + chunksize, :] tm.assert_frame_equal( from_frame, chunk, check_dtype=False, - check_datetimelike_compat=True) + check_datetimelike_compat=True, + check_categorical=False) pos += chunksize itr.close() diff --git a/pandas/tests/frame/test_reshape.py b/pandas/tests/frame/test_reshape.py index e7d64324e6590..43c288162b134 100644 --- a/pandas/tests/frame/test_reshape.py +++ b/pandas/tests/frame/test_reshape.py @@ -158,6 +158,8 @@ def test_unstack_fill(self): index=['x', 'y', 'z'], dtype=np.float) assert_frame_equal(result, expected) + def test_unstack_fill_frame(self): + # From a dataframe rows = [[1, 2], [3, 4], [5, 6], [7, 8]] df = DataFrame(rows, columns=list('AB'), dtype=np.int32) @@ -190,6 +192,8 @@ def test_unstack_fill(self): [('A', 'a'), ('A', 'b'), ('B', 'a'), ('B', 'b')]) assert_frame_equal(result, expected) + def test_unstack_fill_frame_datetime(self): + # Test unstacking with date times dv = pd.date_range('2012-01-01', periods=4).values data = Series(dv) @@ -208,6 +212,8 @@ def test_unstack_fill(self): index=['x', 'y', 'z']) assert_frame_equal(result, expected) + def test_unstack_fill_frame_timedelta(self): + # Test unstacking with time deltas td = [Timedelta(days=i) for i in range(4)] data = Series(td) @@ -226,6 +232,8 @@ def test_unstack_fill(self): index=['x', 'y', 'z']) assert_frame_equal(result, expected) + def test_unstack_fill_frame_period(self): + # Test unstacking with period periods = [Period('2012-01'), Period('2012-02'), Period('2012-03'), Period('2012-04')] @@ -245,6 +253,8 @@ def test_unstack_fill(self): index=['x', 'y', 'z']) assert_frame_equal(result, expected) + def test_unstack_fill_frame_categorical(self): + # Test unstacking with categorical data = pd.Series(['a', 'b', 'c', 'a'], dtype='category') data.index = pd.MultiIndex.from_tuples( @@ -273,27 +283,20 @@ def test_unstack_fill(self): assert_frame_equal(result, expected) def test_stack_ints(self): - df = DataFrame( - np.random.randn(30, 27), - columns=MultiIndex.from_tuples( - list(itertools.product(range(3), repeat=3)) - ) - ) - assert_frame_equal( - df.stack(level=[1, 2]), - df.stack(level=1).stack(level=1) - ) - assert_frame_equal( - df.stack(level=[-2, -1]), - df.stack(level=1).stack(level=1) - ) + columns = MultiIndex.from_tuples(list(itertools.product(range(3), + repeat=3))) + df = DataFrame(np.random.randn(30, 27), columns=columns) + + assert_frame_equal(df.stack(level=[1, 2]), + df.stack(level=1).stack(level=1)) + assert_frame_equal(df.stack(level=[-2, -1]), + df.stack(level=1).stack(level=1)) df_named = df.copy() df_named.columns.set_names(range(3), inplace=True) - assert_frame_equal( - df_named.stack(level=[1, 2]), - df_named.stack(level=1).stack(level=1) - ) + + assert_frame_equal(df_named.stack(level=[1, 2]), + df_named.stack(level=1).stack(level=1)) def test_stack_mixed_levels(self): columns = MultiIndex.from_tuples( diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py index 53ab9aca03f6c..2cb62a60f885b 100644 --- 
a/pandas/tests/indexing/test_categorical.py +++ b/pandas/tests/indexing/test_categorical.py @@ -108,15 +108,17 @@ def test_loc_listlike_dtypes(self): # unique slice res = df.loc[['a', 'b']] - exp = DataFrame({'A': [1, 2], - 'B': [4, 5]}, index=pd.CategoricalIndex(['a', 'b'])) + exp_index = pd.CategoricalIndex(['a', 'b'], + categories=index.categories) + exp = DataFrame({'A': [1, 2], 'B': [4, 5]}, index=exp_index) tm.assert_frame_equal(res, exp, check_index_type=True) # duplicated slice res = df.loc[['a', 'a', 'b']] - exp = DataFrame({'A': [1, 1, 2], - 'B': [4, 4, 5]}, - index=pd.CategoricalIndex(['a', 'a', 'b'])) + + exp_index = pd.CategoricalIndex(['a', 'a', 'b'], + categories=index.categories) + exp = DataFrame({'A': [1, 1, 2], 'B': [4, 4, 5]}, index=exp_index) tm.assert_frame_equal(res, exp, check_index_type=True) with tm.assertRaisesRegexp( @@ -194,12 +196,15 @@ def test_ix_categorical_index(self): expect = pd.Series(df.ix[:, 'X'], index=cdf.index, name='X') assert_series_equal(cdf.ix[:, 'X'], expect) + exp_index = pd.CategoricalIndex(list('AB'), categories=['A', 'B', 'C']) expect = pd.DataFrame(df.ix[['A', 'B'], :], columns=cdf.columns, - index=pd.CategoricalIndex(list('AB'))) + index=exp_index) assert_frame_equal(cdf.ix[['A', 'B'], :], expect) + exp_columns = pd.CategoricalIndex(list('XY'), + categories=['X', 'Y', 'Z']) expect = pd.DataFrame(df.ix[:, ['X', 'Y']], index=cdf.index, - columns=pd.CategoricalIndex(list('XY'))) + columns=exp_columns) assert_frame_equal(cdf.ix[:, ['X', 'Y']], expect) # non-unique @@ -209,12 +214,14 @@ def test_ix_categorical_index(self): cdf.index = pd.CategoricalIndex(df.index) cdf.columns = pd.CategoricalIndex(df.columns) + exp_index = pd.CategoricalIndex(list('AA'), categories=['A', 'B']) expect = pd.DataFrame(df.ix['A', :], columns=cdf.columns, - index=pd.CategoricalIndex(list('AA'))) + index=exp_index) assert_frame_equal(cdf.ix['A', :], expect) + exp_columns = pd.CategoricalIndex(list('XX'), categories=['X', 'Y']) expect = pd.DataFrame(df.ix[:, 'X'], index=cdf.index, - columns=pd.CategoricalIndex(list('XX'))) + columns=exp_columns) assert_frame_equal(cdf.ix[:, 'X'], expect) expect = pd.DataFrame(df.ix[['A', 'B'], :], columns=cdf.columns, diff --git a/pandas/tests/series/test_apply.py b/pandas/tests/series/test_apply.py index 6e0a0175b403f..9cb1e9dd93d16 100644 --- a/pandas/tests/series/test_apply.py +++ b/pandas/tests/series/test_apply.py @@ -187,7 +187,8 @@ def test_map(self): index=pd.CategoricalIndex(['b', 'c', 'd', 'e'])) c = Series(['B', 'C', 'D', 'E'], index=Index(['b', 'c', 'd', 'e'])) - exp = Series([np.nan, 'B', 'C', 'D'], dtype='category') + exp = Series(pd.Categorical([np.nan, 'B', 'C', 'D'], + categories=['B', 'C', 'D', 'E'])) self.assert_series_equal(a.map(b), exp) exp = Series([np.nan, 'B', 'C', 'D']) self.assert_series_equal(a.map(c), exp) diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 5a6667e57ce9d..40ef5354e91bd 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -556,28 +556,35 @@ def test_categories_none(self): def test_describe(self): # string type desc = self.factor.describe() + self.assertTrue(self.factor.ordered) + exp_index = pd.CategoricalIndex(['a', 'b', 'c'], name='categories', + ordered=self.factor.ordered) expected = DataFrame({'counts': [3, 2, 3], 'freqs': [3 / 8., 2 / 8., 3 / 8.]}, - index=pd.CategoricalIndex(['a', 'b', 'c'], - name='categories')) + index=exp_index) tm.assert_frame_equal(desc, expected) # check unused categories cat = 
self.factor.copy() cat.set_categories(["a", "b", "c", "d"], inplace=True) desc = cat.describe() + + exp_index = pd.CategoricalIndex(['a', 'b', 'c', 'd'], + ordered=self.factor.ordered, + name='categories') expected = DataFrame({'counts': [3, 2, 3, 0], 'freqs': [3 / 8., 2 / 8., 3 / 8., 0]}, - index=pd.CategoricalIndex(['a', 'b', 'c', 'd'], - name='categories')) + index=exp_index) tm.assert_frame_equal(desc, expected) # check an integer one - desc = Categorical([1, 2, 3, 1, 2, 3, 3, 2, 1, 1, 1]).describe() + cat = Categorical([1, 2, 3, 1, 2, 3, 3, 2, 1, 1, 1]) + desc = cat.describe() + exp_index = pd.CategoricalIndex([1, 2, 3], ordered=cat.ordered, + name='categories') expected = DataFrame({'counts': [5, 3, 3], 'freqs': [5 / 11., 3 / 11., 3 / 11.]}, - index=pd.CategoricalIndex([1, 2, 3], - name='categories')) + index=exp_index) tm.assert_frame_equal(desc, expected) # https://github.com/pydata/pandas/issues/3678 @@ -601,7 +608,7 @@ def test_describe(self): columns=['counts', 'freqs'], index=pd.CategoricalIndex(['b', 'a', 'c', np.nan], name='categories')) - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected, check_categorical=False) # NA as an unused category with tm.assert_produces_warning(FutureWarning): @@ -613,7 +620,7 @@ def test_describe(self): ['b', 'a', 'c', np.nan], name='categories') expected = DataFrame([[0, 0], [1, 1 / 3.], [2, 2 / 3.], [0, 0]], columns=['counts', 'freqs'], index=exp_idx) - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected, check_categorical=False) def test_print(self): expected = ["[a, b, b, a, a, c, c, c]", @@ -2885,13 +2892,17 @@ def test_value_counts(self): categories=["c", "a", "b", "d"]) s = pd.Series(cats, name='xxx') res = s.value_counts(sort=False) - exp = Series([3, 1, 2, 0], name='xxx', - index=pd.CategoricalIndex(["c", "a", "b", "d"])) + + exp_index = pd.CategoricalIndex(["c", "a", "b", "d"], + categories=cats.categories) + exp = Series([3, 1, 2, 0], name='xxx', index=exp_index) tm.assert_series_equal(res, exp) res = s.value_counts(sort=True) - exp = Series([3, 2, 1, 0], name='xxx', - index=pd.CategoricalIndex(["c", "b", "a", "d"])) + + exp_index = pd.CategoricalIndex(["c", "b", "a", "d"], + categories=cats.categories) + exp = Series([3, 2, 1, 0], name='xxx', index=exp_index) tm.assert_series_equal(res, exp) # check object dtype handles the Series.name as the same @@ -2927,38 +2938,39 @@ def test_value_counts_with_nan(self): index=pd.CategoricalIndex(["a", "b", np.nan]))) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - s = pd.Series(pd.Categorical( - ["a", "b", "a"], categories=["a", "b", np.nan])) - tm.assert_series_equal( - s.value_counts(dropna=True), - pd.Series([2, 1], index=pd.CategoricalIndex(["a", "b"]))) - tm.assert_series_equal( - s.value_counts(dropna=False), - pd.Series([2, 1, 0], - index=pd.CategoricalIndex(["a", "b", np.nan]))) + s = pd.Series(pd.Categorical(["a", "b", "a"], + categories=["a", "b", np.nan])) + + # internal categories are different because of NaN + exp = pd.Series([2, 1], index=pd.CategoricalIndex(["a", "b"])) + tm.assert_series_equal(s.value_counts(dropna=True), exp, + check_categorical=False) + exp = pd.Series([2, 1, 0], + index=pd.CategoricalIndex(["a", "b", np.nan])) + tm.assert_series_equal(s.value_counts(dropna=False), exp, + check_categorical=False) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - s = pd.Series(pd.Categorical( - ["a", "b", None, "a", None, None], categories=["a", "b", np.nan - ])) 
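# A sketch of why the rewritten assertions below need
# check_categorical=False (outputs assumed from the construction above):
# value_counts() builds a fresh CategoricalIndex from the observed
# values, so its categories drop the np.nan category that `s` carries.
#
#   s.values.categories                  # Index(['a', 'b', nan])
#   s.value_counts().index.categories    # Index(['a', 'b'])
#
# The counts match, but a strict internal Categorical comparison fails.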
- tm.assert_series_equal( - s.value_counts(dropna=True), - pd.Series([2, 1], index=pd.CategoricalIndex(["a", "b"]))) - tm.assert_series_equal( - s.value_counts(dropna=False), - pd.Series([3, 2, 1], - index=pd.CategoricalIndex([np.nan, "a", "b"]))) + s = pd.Series(pd.Categorical(["a", "b", None, "a", None, None], + categories=["a", "b", np.nan])) + + exp = pd.Series([2, 1], index=pd.CategoricalIndex(["a", "b"])) + tm.assert_series_equal(s.value_counts(dropna=True), exp, + check_categorical=False) + exp = pd.Series([3, 2, 1], + index=pd.CategoricalIndex([np.nan, "a", "b"])) + tm.assert_series_equal(s.value_counts(dropna=False), exp, + check_categorical=False) def test_groupby(self): - cats = Categorical( - ["a", "a", "a", "b", "b", "b", "c", "c", "c" - ], categories=["a", "b", "c", "d"], ordered=True) + cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"], + categories=["a", "b", "c", "d"], ordered=True) data = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 4, 5], "b": cats}) - expected = DataFrame({'a': Series( - [1, 2, 4, np.nan], index=pd.CategoricalIndex( - ['a', 'b', 'c', 'd'], name='b'))}) + exp_index = pd.CategoricalIndex(['a', 'b', 'c', 'd'], name='b', + ordered=True) + expected = DataFrame({'a': [1, 2, 4, np.nan]}, index=exp_index) result = data.groupby("b").mean() tm.assert_frame_equal(result, expected) @@ -2970,17 +2982,19 @@ def test_groupby(self): # single grouper gb = df.groupby("A") - exp_idx = pd.CategoricalIndex(['a', 'b', 'z'], name='A') + exp_idx = pd.CategoricalIndex(['a', 'b', 'z'], name='A', ordered=True) expected = DataFrame({'values': Series([3, 7, np.nan], index=exp_idx)}) result = gb.sum() tm.assert_frame_equal(result, expected) # multiple groupers gb = df.groupby(['A', 'B']) - expected = DataFrame({'values': Series( - [1, 2, np.nan, 3, 4, np.nan, np.nan, np.nan, np.nan - ], index=pd.MultiIndex.from_product( - [['a', 'b', 'z'], ['c', 'd', 'y']], names=['A', 'B']))}) + exp_index = pd.MultiIndex.from_product([['a', 'b', 'z'], + ['c', 'd', 'y']], + names=['A', 'B']) + expected = DataFrame({'values': [1, 2, np.nan, 3, 4, np.nan, + np.nan, np.nan, np.nan]}, + index=exp_index) result = gb.sum() tm.assert_frame_equal(result, expected) @@ -3054,8 +3068,10 @@ def f(x): df = pd.DataFrame({'a': [1, 0, 0, 0]}) c = pd.cut(df.a, [0, 1, 2, 3, 4]) result = df.groupby(c).apply(len) - expected = pd.Series([1, 0, 0, 0], - index=pd.CategoricalIndex(c.values.categories)) + + exp_index = pd.CategoricalIndex(c.values.categories, + ordered=c.values.ordered) + expected = pd.Series([1, 0, 0, 0], index=exp_index) expected.index.name = 'a' tm.assert_series_equal(result, expected) @@ -3369,30 +3385,28 @@ def test_assigning_ops(self): # assign a part of a column with dtype != categorical -> # exp_parts_cats_col - cats = pd.Categorical( - ["a", "a", "a", "a", "a", "a", "a"], categories=["a", "b"]) + cats = pd.Categorical(["a", "a", "a", "a", "a", "a", "a"], + categories=["a", "b"]) idx = pd.Index(["h", "i", "j", "k", "l", "m", "n"]) values = [1, 1, 1, 1, 1, 1, 1] orig = pd.DataFrame({"cats": cats, "values": values}, index=idx) # the expected values # changed single row - cats1 = pd.Categorical( - ["a", "a", "b", "a", "a", "a", "a"], categories=["a", "b"]) + cats1 = pd.Categorical(["a", "a", "b", "a", "a", "a", "a"], + categories=["a", "b"]) idx1 = pd.Index(["h", "i", "j", "k", "l", "m", "n"]) values1 = [1, 1, 2, 1, 1, 1, 1] - exp_single_row = pd.DataFrame( - {"cats": cats1, - "values": values1}, index=idx1) + exp_single_row = pd.DataFrame({"cats": cats1, + "values": values1}, index=idx1) # 
changed multiple rows - cats2 = pd.Categorical( - ["a", "a", "b", "b", "a", "a", "a"], categories=["a", "b"]) + cats2 = pd.Categorical(["a", "a", "b", "b", "a", "a", "a"], + categories=["a", "b"]) idx2 = pd.Index(["h", "i", "j", "k", "l", "m", "n"]) values2 = [1, 1, 2, 2, 1, 1, 1] - exp_multi_row = pd.DataFrame( - {"cats": cats2, - "values": values2}, index=idx2) + exp_multi_row = pd.DataFrame({"cats": cats2, + "values": values2}, index=idx2) # changed part of the cats column cats3 = pd.Categorical( @@ -3653,7 +3667,8 @@ def f(): exp_fancy["cats"].cat.set_categories(["a", "b", "c"], inplace=True) df[df["cats"] == "c"] = ["b", 2] - tm.assert_frame_equal(df, exp_multi_row) + # category c is kept in .categories + tm.assert_frame_equal(df, exp_fancy) # set_value df = orig.copy() @@ -3708,7 +3723,7 @@ def f(): # ensure that one can set something to np.nan s = Series(Categorical([1, 2, 3])) - exp = Series(Categorical([1, np.nan, 3])) + exp = Series(Categorical([1, np.nan, 3], categories=[1, 2, 3])) s[1] = np.nan tm.assert_series_equal(s, exp) @@ -4083,10 +4098,12 @@ def f(): c = Categorical(["a", "b", np.nan]) with tm.assert_produces_warning(FutureWarning): c.set_categories(["a", "b", np.nan], rename=True, inplace=True) + c[0] = np.nan df = pd.DataFrame({"cats": c, "vals": [1, 2, 3]}) - df_exp = pd.DataFrame({"cats": Categorical(["a", "b", "a"]), - "vals": [1, 2, 3]}) + + cat_exp = Categorical(["a", "b", "a"], categories=["a", "b", np.nan]) + df_exp = pd.DataFrame({"cats": cat_exp, "vals": [1, 2, 3]}) res = df.fillna("a") tm.assert_frame_equal(res, df_exp) @@ -4128,7 +4145,9 @@ def cmp(a, b): ]: result = valid(s) - tm.assert_series_equal(result, s) + # compare series values + # internal .categories can't be compared because it is sorted + tm.assert_series_equal(result, s, check_categorical=False) # invalid conversion (these are NOT a dtype) for invalid in [lambda x: x.astype(pd.Categorical), diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py index 2bad2fabcfc57..794b5e8aa5650 100644 --- a/pandas/tests/test_generic.py +++ b/pandas/tests/test_generic.py @@ -847,7 +847,7 @@ def test_to_xarray(self): assert_almost_equal(list(result.coords.keys()), ['foo']) self.assertIsInstance(result, DataArray) - def testit(index, check_index_type=True): + def testit(index, check_index_type=True, check_categorical=True): s = Series(range(6), index=index(6)) s.index.name = 'foo' result = s.to_xarray() @@ -859,7 +859,8 @@ def testit(index, check_index_type=True): # idempotency assert_series_equal(result.to_series(), s, - check_index_type=check_index_type) + check_index_type=check_index_type, + check_categorical=check_categorical) for index in [tm.makeFloatIndex, tm.makeIntIndex, tm.makeStringIndex, tm.makeUnicodeIndex, @@ -868,7 +869,8 @@ def testit(index, check_index_type=True): testit(index) # not idempotent - testit(tm.makeCategoricalIndex, check_index_type=False) + testit(tm.makeCategoricalIndex, check_index_type=False, + check_categorical=False) s = Series(range(6)) s.index.name = 'foo' @@ -1409,9 +1411,8 @@ def test_to_xarray(self): expected['f'] = expected['f'].astype(object) expected['h'] = expected['h'].astype('datetime64[ns]') expected.columns.name = None - assert_frame_equal(result.to_dataframe(), - expected, - check_index_type=False) + assert_frame_equal(result.to_dataframe(), expected, + check_index_type=False, check_categorical=False) # available in 0.7.1 # MultiIndex diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 5dfe88d04309e..38e6a066d3eea 100644 
--- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -3868,8 +3868,8 @@ def test_groupby_sort_categorical(self): ['(0, 2.5]', 1, 60], ['(5, 7.5]', 7, 70]], columns=['range', 'foo', 'bar']) df['range'] = Categorical(df['range'], ordered=True) - index = CategoricalIndex( - ['(0, 2.5]', '(2.5, 5]', '(5, 7.5]', '(7.5, 10]'], name='range') + index = CategoricalIndex(['(0, 2.5]', '(2.5, 5]', '(5, 7.5]', + '(7.5, 10]'], name='range', ordered=True) result_sort = DataFrame([[1, 60], [5, 30], [6, 40], [10, 10]], columns=['foo', 'bar'], index=index) @@ -3879,13 +3879,15 @@ def test_groupby_sort_categorical(self): assert_frame_equal(result_sort, df.groupby(col, sort=False).first()) df['range'] = Categorical(df['range'], ordered=False) - index = CategoricalIndex( - ['(0, 2.5]', '(2.5, 5]', '(5, 7.5]', '(7.5, 10]'], name='range') + index = CategoricalIndex(['(0, 2.5]', '(2.5, 5]', '(5, 7.5]', + '(7.5, 10]'], name='range') result_sort = DataFrame([[1, 60], [5, 30], [6, 40], [10, 10]], columns=['foo', 'bar'], index=index) - index = CategoricalIndex(['(7.5, 10]', '(2.5, 5]', - '(5, 7.5]', '(0, 2.5]'], + index = CategoricalIndex(['(7.5, 10]', '(2.5, 5]', '(5, 7.5]', + '(0, 2.5]'], + categories=['(7.5, 10]', '(2.5, 5]', + '(5, 7.5]', '(0, 2.5]'], name='range') result_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]], index=index, columns=['foo', 'bar']) @@ -3975,7 +3977,8 @@ def test_groupby_categorical(self): result = data.groupby(cats).mean() expected = data.groupby(np.asarray(cats)).mean() - exp_idx = CategoricalIndex(levels, ordered=True) + exp_idx = CategoricalIndex(levels, categories=cats.categories, + ordered=True) expected = expected.reindex(exp_idx) assert_frame_equal(result, expected) @@ -3986,14 +3989,16 @@ def test_groupby_categorical(self): idx = cats.codes.argsort() ord_labels = np.asarray(cats).take(idx) ord_data = data.take(idx) - expected = ord_data.groupby( - Categorical(ord_labels), sort=False).describe() + + exp_cats = Categorical(ord_labels, ordered=True, + categories=['foo', 'bar', 'baz', 'qux']) + expected = ord_data.groupby(exp_cats, sort=False).describe() expected.index.names = [None, None] assert_frame_equal(desc_result, expected) # GH 10460 - expc = Categorical.from_codes( - np.arange(4).repeat(8), levels, ordered=True) + expc = Categorical.from_codes(np.arange(4).repeat(8), + levels, ordered=True) exp = CategoricalIndex(expc) self.assert_index_equal(desc_result.index.get_level_values(0), exp) exp = Index(['count', 'mean', 'std', 'min', '25%', '50%', @@ -6266,8 +6271,11 @@ def test_groupby_categorical_two_columns(self): # Grouping on a single column groups_single_key = test.groupby("cat") res = groups_single_key.agg('mean') + + exp_index = pd.CategoricalIndex(["a", "b", "c"], name="cat", + ordered=True) exp = DataFrame({"ints": [1.5, 1.5, np.nan], "val": [20, 30, np.nan]}, - index=pd.CategoricalIndex(["a", "b", "c"], name="cat")) + index=exp_index) tm.assert_frame_equal(res, exp) # Grouping on two columns diff --git a/pandas/tests/test_reshape.py b/pandas/tests/test_reshape.py index 862e2282bae2f..7136d7effc1fc 100644 --- a/pandas/tests/test_reshape.py +++ b/pandas/tests/test_reshape.py @@ -239,26 +239,16 @@ def test_just_na(self): def test_include_na(self): s = ['a', 'b', np.nan] res = get_dummies(s, sparse=self.sparse) - exp = DataFrame({'a': {0: 1.0, - 1: 0.0, - 2: 0.0}, - 'b': {0: 0.0, - 1: 1.0, - 2: 0.0}}) + exp = DataFrame({'a': {0: 1.0, 1: 0.0, 2: 0.0}, + 'b': {0: 0.0, 1: 1.0, 2: 0.0}}) assert_frame_equal(res, exp) # Sparse dataframes do not allow 
nan labelled columns, see #GH8822 res_na = get_dummies(s, dummy_na=True, sparse=self.sparse) - exp_na = DataFrame({nan: {0: 0.0, - 1: 0.0, - 2: 1.0}, - 'a': {0: 1.0, - 1: 0.0, - 2: 0.0}, - 'b': {0: 0.0, - 1: 1.0, - 2: 0.0}}).reindex_axis( - ['a', 'b', nan], 1) + exp_na = DataFrame({nan: {0: 0.0, 1: 0.0, 2: 1.0}, + 'a': {0: 1.0, 1: 0.0, 2: 0.0}, + 'b': {0: 0.0, 1: 1.0, 2: 0.0}}) + exp_na = exp_na.reindex_axis(['a', 'b', nan], 1) # hack (NaN handling in assert_index_equal) exp_na.columns = res_na.columns assert_frame_equal(res_na, exp_na) diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 8682302b542be..0ec2c96dbbd7d 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -25,7 +25,7 @@ from pandas.core.common import (is_sequence, array_equivalent, is_list_like, is_datetimelike_v_numeric, is_datetimelike_v_object, is_number, - needs_i8_conversion) + needs_i8_conversion, is_categorical_dtype) from pandas.formats.printing import pprint_thing from pandas.core.algorithms import take_1d @@ -657,7 +657,7 @@ def assert_equal(a, b, msg=""): def assert_index_equal(left, right, exact='equiv', check_names=True, check_less_precise=False, check_exact=True, - obj='Index'): + check_categorical=True, obj='Index'): """Check that left and right Index are equal. Parameters @@ -675,6 +675,8 @@ def assert_index_equal(left, right, exact='equiv', check_names=True, 5 digits (False) or 3 digits (True) after decimal points are compared. check_exact : bool, default True Whether to compare number exactly. + check_categorical : bool, default True + Whether to compare internal Categorical exactly. obj : str, default 'Index' Specify object name being compared, internally used to show appropriate assertion message @@ -752,6 +754,11 @@ def _get_ilevel_values(index, level): if check_names: assert_attr_equal('names', left, right, obj=obj) + if check_categorical: + if is_categorical_dtype(left) or is_categorical_dtype(right): + assert_categorical_equal(left.values, right.values, + obj='{0} category'.format(obj)) + def assert_class_equal(left, right, exact=True, obj='Input'): """checks classes are equal.""" @@ -999,6 +1006,7 @@ def assert_series_equal(left, right, check_dtype=True, check_names=True, check_exact=False, check_datetimelike_compat=False, + check_categorical=True, obj='Series'): """Check that left and right Series are equal. @@ -1023,6 +1031,8 @@ def assert_series_equal(left, right, check_dtype=True, Whether to check the Series and Index names attribute. check_dateteimelike_compat : bool, default False Compare datetime-like which is comparable ignoring dtype. + check_categorical : bool, default True + Whether to compare internal Categorical exactly. 
obj : str, default 'Series' Specify object name being compared, internally used to show appropriate assertion message @@ -1049,6 +1059,7 @@ def assert_series_equal(left, right, check_dtype=True, check_names=check_names, check_less_precise=check_less_precise, check_exact=check_exact, + check_categorical=check_categorical, obj='{0}.index'.format(obj)) if check_dtype: @@ -1085,6 +1096,11 @@ def assert_series_equal(left, right, check_dtype=True, if check_names: assert_attr_equal('name', left, right, obj=obj) + if check_categorical: + if is_categorical_dtype(left) or is_categorical_dtype(right): + assert_categorical_equal(left.values, right.values, + obj='{0} category'.format(obj)) + # This could be refactored to use the NDFrame.equals method def assert_frame_equal(left, right, check_dtype=True, @@ -1096,6 +1112,7 @@ def assert_frame_equal(left, right, check_dtype=True, by_blocks=False, check_exact=False, check_datetimelike_compat=False, + check_categorical=True, check_like=False, obj='DataFrame'): @@ -1127,6 +1144,8 @@ def assert_frame_equal(left, right, check_dtype=True, Whether to compare number exactly. check_dateteimelike_compat : bool, default False Compare datetime-like which is comparable ignoring dtype. + check_categorical : bool, default True + Whether to compare internal Categorical exactly. check_like : bool, default False If true, then reindex_like operands obj : str, default 'DataFrame' @@ -1168,6 +1187,7 @@ def assert_frame_equal(left, right, check_dtype=True, check_names=check_names, check_less_precise=check_less_precise, check_exact=check_exact, + check_categorical=check_categorical, obj='{0}.index'.format(obj)) # column comparison @@ -1175,6 +1195,7 @@ def assert_frame_equal(left, right, check_dtype=True, check_names=check_names, check_less_precise=check_less_precise, check_exact=check_exact, + check_categorical=check_categorical, obj='{0}.columns'.format(obj)) # compare by blocks @@ -1199,6 +1220,7 @@ def assert_frame_equal(left, right, check_dtype=True, check_less_precise=check_less_precise, check_exact=check_exact, check_names=check_names, check_datetimelike_compat=check_datetimelike_compat, + check_categorical=check_categorical, obj='DataFrame.iloc[:, {0}]'.format(i)) From b88eb35ad98ac7a99451b505acc74e5d0e3a81b1 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Sat, 21 May 2016 20:00:55 -0400 Subject: [PATCH 40/96] TST/ERR: Add Period ops tests / fix error message xref to #13242 closes #13251 Author: sinhrks Closes #13250 from sinhrks/period_test and squashes the following commits: 33a04f3 [sinhrks] TST/ERR: Add Period ops tests / fix error message --- pandas/src/period.pyx | 9 +- pandas/tseries/tests/test_period.py | 426 ++++++++++++++++---------- pandas/tseries/tests/test_resample.py | 23 +- pandas/util/testing.py | 2 + 4 files changed, 276 insertions(+), 184 deletions(-) diff --git a/pandas/src/period.pyx b/pandas/src/period.pyx index 670fe1e4f168c..858aa58df8d7d 100644 --- a/pandas/src/period.pyx +++ b/pandas/src/period.pyx @@ -799,8 +799,8 @@ cdef class Period(object): else: ordinal = self.ordinal + (nanos // offset_nanos) return Period(ordinal=ordinal, freq=self.freq) - msg = 'Input cannnot be converted to Period(freq={0})' - raise ValueError(msg) + msg = 'Input cannot be converted to Period(freq={0})' + raise IncompatibleFrequency(msg.format(self.freqstr)) elif isinstance(other, offsets.DateOffset): freqstr = frequencies.get_standard_freq(other) base = frequencies.get_base_alias(freqstr) @@ -849,8 +849,8 @@ cdef class Period(object): return Period(ordinal=ordinal, 
freq=self.freq) elif isinstance(other, Period): if other.freq != self.freq: - raise ValueError("Cannot do arithmetic with " - "non-conforming periods") + msg = _DIFFERENT_FREQ.format(self.freqstr, other.freqstr) + raise IncompatibleFrequency(msg) if self.ordinal == tslib.iNaT or other.ordinal == tslib.iNaT: return Period(ordinal=tslib.iNaT, freq=self.freq) return self.ordinal - other.ordinal @@ -865,7 +865,6 @@ cdef class Period(object): else: return NotImplemented - def asfreq(self, freq, how='E'): """ Convert Period to desired frequency, either at the start or end of the diff --git a/pandas/tseries/tests/test_period.py b/pandas/tseries/tests/test_period.py index c5aae1f8ecebb..8ebdcc7acff2d 100644 --- a/pandas/tseries/tests/test_period.py +++ b/pandas/tseries/tests/test_period.py @@ -492,8 +492,8 @@ def test_sub_delta(self): result = left - right self.assertEqual(result, 4) - self.assertRaises(ValueError, left.__sub__, - Period('2007-01', freq='M')) + with self.assertRaises(period.IncompatibleFrequency): + left - Period('2007-01', freq='M') def test_to_timestamp(self): p = Period('1982', freq='A') @@ -829,9 +829,13 @@ def test_asfreq_MS(self): self.assertEqual(initial.asfreq(freq="M", how="S"), Period('2013-01', 'M')) - self.assertRaises(ValueError, initial.asfreq, freq="MS", how="S") - tm.assertRaisesRegexp(ValueError, "Unknown freqstr: MS", pd.Period, - '2013-01', 'MS') + + with self.assertRaisesRegexp(ValueError, "Unknown freqstr"): + initial.asfreq(freq="MS", how="S") + + with tm.assertRaisesRegexp(ValueError, "Unknown freqstr: MS"): + pd.Period('2013-01', 'MS') + self.assertTrue(_period_code_map.get("MS") is None) @@ -1638,7 +1642,7 @@ def test_constructor_use_start_freq(self): p = Period('4/2/2012', freq='B') index = PeriodIndex(start=p, periods=10) expected = PeriodIndex(start='4/2/2012', periods=10, freq='B') - self.assertTrue(index.equals(expected)) + tm.assert_index_equal(index, expected) def test_constructor_field_arrays(self): # GH #1264 @@ -1648,13 +1652,13 @@ def test_constructor_field_arrays(self): index = PeriodIndex(year=years, quarter=quarters, freq='Q-DEC') expected = period_range('1990Q3', '2009Q2', freq='Q-DEC') - self.assertTrue(index.equals(expected)) + tm.assert_index_equal(index, expected) index2 = PeriodIndex(year=years, quarter=quarters, freq='2Q-DEC') tm.assert_numpy_array_equal(index.asi8, index2.asi8) index = PeriodIndex(year=years, quarter=quarters) - self.assertTrue(index.equals(expected)) + tm.assert_index_equal(index, expected) years = [2007, 2007, 2007] months = [1, 2] @@ -1669,7 +1673,7 @@ def test_constructor_field_arrays(self): months = [1, 2, 3] idx = PeriodIndex(year=years, month=months, freq='M') exp = period_range('2007-01', periods=3, freq='M') - self.assertTrue(idx.equals(exp)) + tm.assert_index_equal(idx, exp) def test_constructor_U(self): # U was used as undefined period @@ -1700,7 +1704,7 @@ def test_constructor_corner(self): result = period_range('2007-01', periods=10.5, freq='M') exp = period_range('2007-01', periods=10, freq='M') - self.assertTrue(result.equals(exp)) + tm.assert_index_equal(result, exp) def test_constructor_fromarraylike(self): idx = period_range('2007-01', periods=20, freq='M') @@ -1711,29 +1715,29 @@ def test_constructor_fromarraylike(self): data=Period('2007', freq='A')) result = PeriodIndex(iter(idx)) - self.assertTrue(result.equals(idx)) + tm.assert_index_equal(result, idx) result = PeriodIndex(idx) - self.assertTrue(result.equals(idx)) + tm.assert_index_equal(result, idx) result = PeriodIndex(idx, freq='M') - 
self.assertTrue(result.equals(idx)) + tm.assert_index_equal(result, idx) result = PeriodIndex(idx, freq=offsets.MonthEnd()) - self.assertTrue(result.equals(idx)) + tm.assert_index_equal(result, idx) self.assertTrue(result.freq, 'M') result = PeriodIndex(idx, freq='2M') - self.assertTrue(result.equals(idx)) + tm.assert_index_equal(result, idx.asfreq('2M')) self.assertTrue(result.freq, '2M') result = PeriodIndex(idx, freq=offsets.MonthEnd(2)) - self.assertTrue(result.equals(idx)) + tm.assert_index_equal(result, idx.asfreq('2M')) self.assertTrue(result.freq, '2M') result = PeriodIndex(idx, freq='D') exp = idx.asfreq('D', 'e') - self.assertTrue(result.equals(exp)) + tm.assert_index_equal(result, exp) def test_constructor_datetime64arr(self): vals = np.arange(100000, 100000 + 10000, 100, dtype=np.int64) @@ -1744,10 +1748,10 @@ def test_constructor_datetime64arr(self): def test_constructor_simple_new(self): idx = period_range('2007-01', name='p', periods=2, freq='M') result = idx._simple_new(idx, 'p', freq=idx.freq) - self.assertTrue(result.equals(idx)) + tm.assert_index_equal(result, idx) result = idx._simple_new(idx.astype('i8'), 'p', freq=idx.freq) - self.assertTrue(result.equals(idx)) + tm.assert_index_equal(result, idx) result = idx._simple_new( [pd.Period('2007-01', freq='M'), pd.Period('2007-02', freq='M')], @@ -1801,14 +1805,14 @@ def test_constructor_freq_mult(self): for func in [PeriodIndex, period_range]: # must be the same, but for sure... pidx = func(start='2014-01', freq='2M', periods=4) - expected = PeriodIndex( - ['2014-01', '2014-03', '2014-05', '2014-07'], freq='M') + expected = PeriodIndex(['2014-01', '2014-03', + '2014-05', '2014-07'], freq='2M') tm.assert_index_equal(pidx, expected) pidx = func(start='2014-01-02', end='2014-01-15', freq='3D') expected = PeriodIndex(['2014-01-02', '2014-01-05', '2014-01-08', '2014-01-11', - '2014-01-14'], freq='D') + '2014-01-14'], freq='3D') tm.assert_index_equal(pidx, expected) pidx = func(end='2014-01-01 17:00', freq='4H', periods=3) @@ -1837,7 +1841,7 @@ def test_constructor_freq_mult_dti_compat(self): freqstr = str(mult) + freq pidx = PeriodIndex(start='2014-04-01', freq=freqstr, periods=10) expected = date_range(start='2014-04-01', freq=freqstr, - periods=10).to_period(freq) + periods=10).to_period(freqstr) tm.assert_index_equal(pidx, expected) def test_is_(self): @@ -1965,11 +1969,11 @@ def test_sub(self): result = rng - 5 exp = rng + (-5) - self.assertTrue(result.equals(exp)) + tm.assert_index_equal(result, exp) def test_periods_number_check(self): - self.assertRaises(ValueError, period_range, '2011-1-1', '2012-1-1', - 'B') + with tm.assertRaises(ValueError): + period_range('2011-1-1', '2012-1-1', 'B') def test_tolist(self): index = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009') @@ -1977,7 +1981,7 @@ def test_tolist(self): [tm.assertIsInstance(x, Period) for x in rs] recon = PeriodIndex(rs) - self.assertTrue(index.equals(recon)) + tm.assert_index_equal(index, recon) def test_to_timestamp(self): index = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009') @@ -1985,12 +1989,12 @@ def test_to_timestamp(self): exp_index = date_range('1/1/2001', end='12/31/2009', freq='A-DEC') result = series.to_timestamp(how='end') - self.assertTrue(result.index.equals(exp_index)) + tm.assert_index_equal(result.index, exp_index) self.assertEqual(result.name, 'foo') exp_index = date_range('1/1/2001', end='1/1/2009', freq='AS-JAN') result = series.to_timestamp(how='start') - self.assertTrue(result.index.equals(exp_index)) + 
tm.assert_index_equal(result.index, exp_index) def _get_with_delta(delta, freq='A-DEC'): return date_range(to_datetime('1/1/2001') + delta, @@ -1999,17 +2003,17 @@ def _get_with_delta(delta, freq='A-DEC'): delta = timedelta(hours=23) result = series.to_timestamp('H', 'end') exp_index = _get_with_delta(delta) - self.assertTrue(result.index.equals(exp_index)) + tm.assert_index_equal(result.index, exp_index) delta = timedelta(hours=23, minutes=59) result = series.to_timestamp('T', 'end') exp_index = _get_with_delta(delta) - self.assertTrue(result.index.equals(exp_index)) + tm.assert_index_equal(result.index, exp_index) result = series.to_timestamp('S', 'end') delta = timedelta(hours=23, minutes=59, seconds=59) exp_index = _get_with_delta(delta) - self.assertTrue(result.index.equals(exp_index)) + tm.assert_index_equal(result.index, exp_index) index = PeriodIndex(freq='H', start='1/1/2001', end='1/2/2001') series = Series(1, index=index, name='foo') @@ -2017,7 +2021,7 @@ def _get_with_delta(delta, freq='A-DEC'): exp_index = date_range('1/1/2001 00:59:59', end='1/2/2001 00:59:59', freq='H') result = series.to_timestamp(how='end') - self.assertTrue(result.index.equals(exp_index)) + tm.assert_index_equal(result.index, exp_index) self.assertEqual(result.name, 'foo') def test_to_timestamp_quarterly_bug(self): @@ -2028,7 +2032,7 @@ def test_to_timestamp_quarterly_bug(self): stamps = pindex.to_timestamp('D', 'end') expected = DatetimeIndex([x.to_timestamp('D', 'end') for x in pindex]) - self.assertTrue(stamps.equals(expected)) + tm.assert_index_equal(stamps, expected) def test_to_timestamp_preserve_name(self): index = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009', @@ -2054,11 +2058,11 @@ def test_to_timestamp_pi_nat(self): result = index.to_timestamp('D') expected = DatetimeIndex([pd.NaT, datetime(2011, 1, 1), datetime(2011, 2, 1)], name='idx') - self.assertTrue(result.equals(expected)) + tm.assert_index_equal(result, expected) self.assertEqual(result.name, 'idx') result2 = result.to_period(freq='M') - self.assertTrue(result2.equals(index)) + tm.assert_index_equal(result2, index) self.assertEqual(result2.name, 'idx') result3 = result.to_period(freq='3M') @@ -2085,12 +2089,12 @@ def test_to_timestamp_pi_mult(self): def test_start_time(self): index = PeriodIndex(freq='M', start='2016-01-01', end='2016-05-31') expected_index = date_range('2016-01-01', end='2016-05-31', freq='MS') - self.assertTrue(index.start_time.equals(expected_index)) + tm.assert_index_equal(index.start_time, expected_index) def test_end_time(self): index = PeriodIndex(freq='M', start='2016-01-01', end='2016-05-31') expected_index = date_range('2016-01-01', end='2016-05-31', freq='M') - self.assertTrue(index.end_time.equals(expected_index)) + tm.assert_index_equal(index.end_time, expected_index) def test_as_frame_columns(self): rng = period_range('1/1/2000', periods=5) @@ -2115,17 +2119,18 @@ def test_indexing(self): self.assertEqual(expected, result) def test_frame_setitem(self): - rng = period_range('1/1/2000', periods=5) - rng.name = 'index' + rng = period_range('1/1/2000', periods=5, name='index') df = DataFrame(randn(5, 3), index=rng) df['Index'] = rng rs = Index(df['Index']) - self.assertTrue(rs.equals(rng)) + tm.assert_index_equal(rs, rng, check_names=False) + self.assertEqual(rs.name, 'Index') + self.assertEqual(rng.name, 'index') rs = df.reset_index().set_index('index') tm.assertIsInstance(rs.index, PeriodIndex) - self.assertTrue(rs.index.equals(rng)) + tm.assert_index_equal(rs.index, rng) def 
test_period_set_index_reindex(self): # GH 6631 @@ -2134,9 +2139,9 @@ def test_period_set_index_reindex(self): idx2 = period_range('2013', periods=6, freq='A') df = df.set_index(idx1) - self.assertTrue(df.index.equals(idx1)) + tm.assert_index_equal(df.index, idx1) df = df.set_index(idx2) - self.assertTrue(df.index.equals(idx2)) + tm.assert_index_equal(df.index, idx2) def test_frame_to_time_stamp(self): K = 5 @@ -2146,12 +2151,12 @@ def test_frame_to_time_stamp(self): exp_index = date_range('1/1/2001', end='12/31/2009', freq='A-DEC') result = df.to_timestamp('D', 'end') - self.assertTrue(result.index.equals(exp_index)) + tm.assert_index_equal(result.index, exp_index) assert_almost_equal(result.values, df.values) exp_index = date_range('1/1/2001', end='1/1/2009', freq='AS-JAN') result = df.to_timestamp('D', 'start') - self.assertTrue(result.index.equals(exp_index)) + tm.assert_index_equal(result.index, exp_index) def _get_with_delta(delta, freq='A-DEC'): return date_range(to_datetime('1/1/2001') + delta, @@ -2160,44 +2165,44 @@ def _get_with_delta(delta, freq='A-DEC'): delta = timedelta(hours=23) result = df.to_timestamp('H', 'end') exp_index = _get_with_delta(delta) - self.assertTrue(result.index.equals(exp_index)) + tm.assert_index_equal(result.index, exp_index) delta = timedelta(hours=23, minutes=59) result = df.to_timestamp('T', 'end') exp_index = _get_with_delta(delta) - self.assertTrue(result.index.equals(exp_index)) + tm.assert_index_equal(result.index, exp_index) result = df.to_timestamp('S', 'end') delta = timedelta(hours=23, minutes=59, seconds=59) exp_index = _get_with_delta(delta) - self.assertTrue(result.index.equals(exp_index)) + tm.assert_index_equal(result.index, exp_index) # columns df = df.T exp_index = date_range('1/1/2001', end='12/31/2009', freq='A-DEC') result = df.to_timestamp('D', 'end', axis=1) - self.assertTrue(result.columns.equals(exp_index)) + tm.assert_index_equal(result.columns, exp_index) assert_almost_equal(result.values, df.values) exp_index = date_range('1/1/2001', end='1/1/2009', freq='AS-JAN') result = df.to_timestamp('D', 'start', axis=1) - self.assertTrue(result.columns.equals(exp_index)) + tm.assert_index_equal(result.columns, exp_index) delta = timedelta(hours=23) result = df.to_timestamp('H', 'end', axis=1) exp_index = _get_with_delta(delta) - self.assertTrue(result.columns.equals(exp_index)) + tm.assert_index_equal(result.columns, exp_index) delta = timedelta(hours=23, minutes=59) result = df.to_timestamp('T', 'end', axis=1) exp_index = _get_with_delta(delta) - self.assertTrue(result.columns.equals(exp_index)) + tm.assert_index_equal(result.columns, exp_index) result = df.to_timestamp('S', 'end', axis=1) delta = timedelta(hours=23, minutes=59, seconds=59) exp_index = _get_with_delta(delta) - self.assertTrue(result.columns.equals(exp_index)) + tm.assert_index_equal(result.columns, exp_index) # invalid axis assertRaisesRegexp(ValueError, 'axis', df.to_timestamp, axis=2) @@ -2351,7 +2356,7 @@ def test_shift(self): pi1 = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009') pi2 = PeriodIndex(freq='A', start='1/1/2002', end='12/1/2010') - self.assertTrue(pi1.shift(0).equals(pi1)) + tm.assert_index_equal(pi1.shift(0), pi1) assert_equal(len(pi1), len(pi2)) assert_equal(pi1.shift(1).values, pi2.values) @@ -2385,25 +2390,25 @@ def test_shift_nat(self): idx = PeriodIndex(['2011-01', '2011-02', 'NaT', '2011-04'], freq='M', name='idx') result = idx.shift(1) - expected = PeriodIndex( - ['2011-02', '2011-03', 'NaT', '2011-05'], freq='M', name='idx') - 
self.assertTrue(result.equals(expected)) + expected = PeriodIndex(['2011-02', '2011-03', 'NaT', + '2011-05'], freq='M', name='idx') + tm.assert_index_equal(result, expected) self.assertEqual(result.name, expected.name) def test_shift_ndarray(self): idx = PeriodIndex(['2011-01', '2011-02', 'NaT', '2011-04'], freq='M', name='idx') result = idx.shift(np.array([1, 2, 3, 4])) - expected = PeriodIndex( - ['2011-02', '2011-04', 'NaT', '2011-08'], freq='M', name='idx') - self.assertTrue(result.equals(expected)) + expected = PeriodIndex(['2011-02', '2011-04', 'NaT', + '2011-08'], freq='M', name='idx') + tm.assert_index_equal(result, expected) idx = PeriodIndex(['2011-01', '2011-02', 'NaT', '2011-04'], freq='M', name='idx') result = idx.shift(np.array([1, -2, 3, -4])) - expected = PeriodIndex( - ['2011-02', '2010-12', 'NaT', '2010-12'], freq='M', name='idx') - self.assertTrue(result.equals(expected)) + expected = PeriodIndex(['2011-02', '2010-12', 'NaT', + '2010-12'], freq='M', name='idx') + tm.assert_index_equal(result, expected) def test_asfreq(self): pi1 = PeriodIndex(freq='A', start='1/1/2001', end='1/1/2001') @@ -2477,7 +2482,7 @@ def test_asfreq_nat(self): idx = PeriodIndex(['2011-01', '2011-02', 'NaT', '2011-04'], freq='M') result = idx.asfreq(freq='Q') expected = PeriodIndex(['2011Q1', '2011Q1', 'NaT', '2011Q2'], freq='Q') - self.assertTrue(result.equals(expected)) + tm.assert_index_equal(result, expected) def test_asfreq_mult_pi(self): pi = PeriodIndex(['2001-01', '2001-02', 'NaT', '2001-03'], freq='2M') @@ -2576,12 +2581,12 @@ def test_asfreq_ts(self): df_result = df.asfreq('D', how='end') exp_index = index.asfreq('D', how='end') self.assertEqual(len(result), len(ts)) - self.assertTrue(result.index.equals(exp_index)) - self.assertTrue(df_result.index.equals(exp_index)) + tm.assert_index_equal(result.index, exp_index) + tm.assert_index_equal(df_result.index, exp_index) result = ts.asfreq('D', how='start') self.assertEqual(len(result), len(ts)) - self.assertTrue(result.index.equals(index.asfreq('D', how='start'))) + tm.assert_index_equal(result.index, index.asfreq('D', how='start')) def test_badinput(self): self.assertRaises(datetools.DateParseError, Period, '1/1/-2000', 'A') @@ -2783,11 +2788,11 @@ def test_pindex_qaccess(self): def test_period_dt64_round_trip(self): dti = date_range('1/1/2000', '1/7/2002', freq='B') pi = dti.to_period() - self.assertTrue(pi.to_timestamp().equals(dti)) + tm.assert_index_equal(pi.to_timestamp(), dti) dti = date_range('1/1/2000', '1/7/2002', freq='B') pi = dti.to_period(freq='H') - self.assertTrue(pi.to_timestamp().equals(dti)) + tm.assert_index_equal(pi.to_timestamp(), dti) def test_to_period_quarterly(self): # make sure we can make the round trip @@ -2796,7 +2801,7 @@ def test_to_period_quarterly(self): rng = period_range('1989Q3', '1991Q3', freq=freq) stamps = rng.to_timestamp() result = stamps.to_period(freq) - self.assertTrue(rng.equals(result)) + tm.assert_index_equal(rng, result) def test_to_period_quarterlyish(self): offsets = ['BQ', 'QS', 'BQS'] @@ -2841,7 +2846,7 @@ def test_multiples(self): def test_pindex_multiples(self): pi = PeriodIndex(start='1/1/11', end='12/31/11', freq='2M') expected = PeriodIndex(['2011-01', '2011-03', '2011-05', '2011-07', - '2011-09', '2011-11'], freq='M') + '2011-09', '2011-11'], freq='2M') tm.assert_index_equal(pi, expected) self.assertEqual(pi.freq, offsets.MonthEnd(2)) self.assertEqual(pi.freqstr, '2M') @@ -2874,7 +2879,7 @@ def test_take(self): taken2 = index[[5, 6, 8, 12]] for taken in [taken1, taken2]: - 
self.assertTrue(taken.equals(expected)) + tm.assert_index_equal(taken, expected) tm.assertIsInstance(taken, PeriodIndex) self.assertEqual(taken.freq, index.freq) self.assertEqual(taken.name, expected.name) @@ -2954,7 +2959,7 @@ def test_align_series(self): for kind in ['inner', 'outer', 'left', 'right']: ts.align(ts[::2], join=kind) msg = "Input has different freq=D from PeriodIndex\\(freq=A-DEC\\)" - with assertRaisesRegexp(ValueError, msg): + with assertRaisesRegexp(period.IncompatibleFrequency, msg): ts + ts.asfreq('D', how="end") def test_align_frame(self): @@ -2973,11 +2978,11 @@ def test_union(self): index = period_range('1/1/2000', '1/20/2000', freq='D') result = index[:-5].union(index[10:]) - self.assertTrue(result.equals(index)) + tm.assert_index_equal(result, index) # not in order result = _permute(index[:-5]).union(_permute(index[10:])) - self.assertTrue(result.equals(index)) + tm.assert_index_equal(result, index) # raise if different frequencies index = period_range('1/1/2000', '1/20/2000', freq='D') @@ -3008,13 +3013,13 @@ def test_intersection(self): index = period_range('1/1/2000', '1/20/2000', freq='D') result = index[:-5].intersection(index[10:]) - self.assertTrue(result.equals(index[10:-5])) + tm.assert_index_equal(result, index[10:-5]) # not in order left = _permute(index[:-5]) right = _permute(index[10:]) result = left.intersection(right).sort_values() - self.assertTrue(result.equals(index[10:-5])) + tm.assert_index_equal(result, index[10:-5]) # raise if different frequencies index = period_range('1/1/2000', '1/20/2000', freq='D') @@ -3045,7 +3050,7 @@ def test_intersection_cases(self): for (rng, expected) in [(rng2, expected2), (rng3, expected3), (rng4, expected4)]: result = base.intersection(rng) - self.assertTrue(result.equals(expected)) + tm.assert_index_equal(result, expected) self.assertEqual(result.name, expected.name) self.assertEqual(result.freq, expected.freq) @@ -3071,7 +3076,7 @@ def test_intersection_cases(self): for (rng, expected) in [(rng2, expected2), (rng3, expected3), (rng4, expected4)]: result = base.intersection(rng) - self.assertTrue(result.equals(expected)) + tm.assert_index_equal(result, expected) self.assertEqual(result.name, expected.name) self.assertEqual(result.freq, 'D') @@ -3151,7 +3156,7 @@ def test_map(self): index = PeriodIndex([2005, 2007, 2009], freq='A') result = index.map(lambda x: x + 1) expected = index + 1 - self.assertTrue(result.equals(expected)) + tm.assert_index_equal(result, expected) result = index.map(lambda x: x.ordinal) exp = [x.ordinal for x in index] @@ -3252,11 +3257,11 @@ def test_factorize(self): arr, idx = idx1.factorize() self.assert_numpy_array_equal(arr, exp_arr) - self.assertTrue(idx.equals(exp_idx)) + tm.assert_index_equal(idx, exp_idx) arr, idx = idx1.factorize(sort=True) self.assert_numpy_array_equal(arr, exp_arr) - self.assertTrue(idx.equals(exp_idx)) + tm.assert_index_equal(idx, exp_idx) idx2 = pd.PeriodIndex(['2014-03', '2014-03', '2014-02', '2014-01', '2014-03', '2014-01'], freq='M') @@ -3264,19 +3269,19 @@ def test_factorize(self): exp_arr = np.array([2, 2, 1, 0, 2, 0]) arr, idx = idx2.factorize(sort=True) self.assert_numpy_array_equal(arr, exp_arr) - self.assertTrue(idx.equals(exp_idx)) + tm.assert_index_equal(idx, exp_idx) exp_arr = np.array([0, 0, 1, 2, 0, 2]) exp_idx = PeriodIndex(['2014-03', '2014-02', '2014-01'], freq='M') arr, idx = idx2.factorize() self.assert_numpy_array_equal(arr, exp_arr) - self.assertTrue(idx.equals(exp_idx)) + tm.assert_index_equal(idx, exp_idx) def 
test_recreate_from_data(self): for o in ['M', 'Q', 'A', 'D', 'B', 'T', 'S', 'L', 'U', 'N', 'H']: org = PeriodIndex(start='2001/04/01', freq=o, periods=1) idx = PeriodIndex(org.values, freq=o) - self.assertTrue(idx.equals(org)) + tm.assert_index_equal(idx, org) def test_combine_first(self): # GH 3367 @@ -3324,7 +3329,6 @@ def _permute(obj): class TestMethods(tm.TestCase): - "Base test class for MaskedArrays." def test_add(self): dt1 = Period(freq='D', year=2008, month=1, day=1) @@ -3356,6 +3360,17 @@ def test_add_raises(self): with tm.assertRaisesRegexp(TypeError, msg): dt1 + dt2 + def test_sub(self): + dt1 = Period('2011-01-01', freq='D') + dt2 = Period('2011-01-15', freq='D') + + self.assertEqual(dt1 - dt2, -14) + self.assertEqual(dt2 - dt1, 14) + + msg = "Input has different freq=M from Period\(freq=D\)" + with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg): + dt1 - pd.Period('2011-02', freq='M') + def test_add_offset(self): # freq is DateOffset for freq in ['A', '2A', '3A']: @@ -3367,14 +3382,14 @@ def test_add_offset(self): for o in [offsets.YearBegin(2), offsets.MonthBegin(1), offsets.Minute(), np.timedelta64(365, 'D'), timedelta(365)]: - with tm.assertRaises(ValueError): + with tm.assertRaises(period.IncompatibleFrequency): p + o if isinstance(o, np.timedelta64): with tm.assertRaises(TypeError): o + p else: - with tm.assertRaises(ValueError): + with tm.assertRaises(period.IncompatibleFrequency): o + p for freq in ['M', '2M', '3M']: @@ -3390,14 +3405,14 @@ def test_add_offset(self): for o in [offsets.YearBegin(2), offsets.MonthBegin(1), offsets.Minute(), np.timedelta64(365, 'D'), timedelta(365)]: - with tm.assertRaises(ValueError): + with tm.assertRaises(period.IncompatibleFrequency): p + o if isinstance(o, np.timedelta64): with tm.assertRaises(TypeError): o + p else: - with tm.assertRaises(ValueError): + with tm.assertRaises(period.IncompatibleFrequency): o + p # freq is Tick @@ -3433,14 +3448,14 @@ def test_add_offset(self): for o in [offsets.YearBegin(2), offsets.MonthBegin(1), offsets.Minute(), np.timedelta64(4, 'h'), timedelta(hours=23)]: - with tm.assertRaises(ValueError): + with tm.assertRaises(period.IncompatibleFrequency): p + o if isinstance(o, np.timedelta64): with tm.assertRaises(TypeError): o + p else: - with tm.assertRaises(ValueError): + with tm.assertRaises(period.IncompatibleFrequency): o + p for freq in ['H', '2H', '3H']: @@ -3475,14 +3490,14 @@ def test_add_offset(self): for o in [offsets.YearBegin(2), offsets.MonthBegin(1), offsets.Minute(), np.timedelta64(3200, 's'), timedelta(hours=23, minutes=30)]: - with tm.assertRaises(ValueError): + with tm.assertRaises(period.IncompatibleFrequency): p + o if isinstance(o, np.timedelta64): with tm.assertRaises(TypeError): o + p else: - with tm.assertRaises(ValueError): + with tm.assertRaises(period.IncompatibleFrequency): o + p def test_add_offset_nat(self): @@ -3496,14 +3511,14 @@ def test_add_offset_nat(self): for o in [offsets.YearBegin(2), offsets.MonthBegin(1), offsets.Minute(), np.timedelta64(365, 'D'), timedelta(365)]: - with tm.assertRaises(ValueError): + with tm.assertRaises(period.IncompatibleFrequency): p + o if isinstance(o, np.timedelta64): with tm.assertRaises(TypeError): o + p else: - with tm.assertRaises(ValueError): + with tm.assertRaises(period.IncompatibleFrequency): o + p for freq in ['M', '2M', '3M']: @@ -3520,14 +3535,14 @@ def test_add_offset_nat(self): for o in [offsets.YearBegin(2), offsets.MonthBegin(1), offsets.Minute(), np.timedelta64(365, 'D'), timedelta(365)]: - with 
tm.assertRaises(ValueError): + with tm.assertRaises(period.IncompatibleFrequency): p + o if isinstance(o, np.timedelta64): with tm.assertRaises(TypeError): o + p else: - with tm.assertRaises(ValueError): + with tm.assertRaises(period.IncompatibleFrequency): o + p # freq is Tick for freq in ['D', '2D', '3D']: @@ -3547,14 +3562,14 @@ def test_add_offset_nat(self): offsets.Minute(), np.timedelta64(4, 'h'), timedelta(hours=23)]: - with tm.assertRaises(ValueError): + with tm.assertRaises(period.IncompatibleFrequency): p + o if isinstance(o, np.timedelta64): with tm.assertRaises(TypeError): o + p else: - with tm.assertRaises(ValueError): + with tm.assertRaises(period.IncompatibleFrequency): o + p for freq in ['H', '2H', '3H']: @@ -3570,14 +3585,14 @@ def test_add_offset_nat(self): for o in [offsets.YearBegin(2), offsets.MonthBegin(1), offsets.Minute(), np.timedelta64(3200, 's'), timedelta(hours=23, minutes=30)]: - with tm.assertRaises(ValueError): + with tm.assertRaises(period.IncompatibleFrequency): p + o if isinstance(o, np.timedelta64): with tm.assertRaises(TypeError): o + p else: - with tm.assertRaises(ValueError): + with tm.assertRaises(period.IncompatibleFrequency): o + p def test_sub_pdnat(self): @@ -3599,7 +3614,7 @@ def test_sub_offset(self): for o in [offsets.YearBegin(2), offsets.MonthBegin(1), offsets.Minute(), np.timedelta64(365, 'D'), timedelta(365)]: - with tm.assertRaises(ValueError): + with tm.assertRaises(period.IncompatibleFrequency): p - o for freq in ['M', '2M', '3M']: @@ -3612,7 +3627,7 @@ def test_sub_offset(self): for o in [offsets.YearBegin(2), offsets.MonthBegin(1), offsets.Minute(), np.timedelta64(365, 'D'), timedelta(365)]: - with tm.assertRaises(ValueError): + with tm.assertRaises(period.IncompatibleFrequency): p - o # freq is Tick @@ -3634,7 +3649,7 @@ def test_sub_offset(self): for o in [offsets.YearBegin(2), offsets.MonthBegin(1), offsets.Minute(), np.timedelta64(4, 'h'), timedelta(hours=23)]: - with tm.assertRaises(ValueError): + with tm.assertRaises(period.IncompatibleFrequency): p - o for freq in ['H', '2H', '3H']: @@ -3655,7 +3670,7 @@ def test_sub_offset(self): for o in [offsets.YearBegin(2), offsets.MonthBegin(1), offsets.Minute(), np.timedelta64(3200, 's'), timedelta(hours=23, minutes=30)]: - with tm.assertRaises(ValueError): + with tm.assertRaises(period.IncompatibleFrequency): p - o def test_sub_offset_nat(self): @@ -3668,7 +3683,7 @@ def test_sub_offset_nat(self): for o in [offsets.YearBegin(2), offsets.MonthBegin(1), offsets.Minute(), np.timedelta64(365, 'D'), timedelta(365)]: - with tm.assertRaises(ValueError): + with tm.assertRaises(period.IncompatibleFrequency): p - o for freq in ['M', '2M', '3M']: @@ -3679,7 +3694,7 @@ def test_sub_offset_nat(self): for o in [offsets.YearBegin(2), offsets.MonthBegin(1), offsets.Minute(), np.timedelta64(365, 'D'), timedelta(365)]: - with tm.assertRaises(ValueError): + with tm.assertRaises(period.IncompatibleFrequency): p - o # freq is Tick @@ -3693,7 +3708,7 @@ def test_sub_offset_nat(self): for o in [offsets.YearBegin(2), offsets.MonthBegin(1), offsets.Minute(), np.timedelta64(4, 'h'), timedelta(hours=23)]: - with tm.assertRaises(ValueError): + with tm.assertRaises(period.IncompatibleFrequency): p - o for freq in ['H', '2H', '3H']: @@ -3706,7 +3721,7 @@ def test_sub_offset_nat(self): for o in [offsets.YearBegin(2), offsets.MonthBegin(1), offsets.Minute(), np.timedelta64(3200, 's'), timedelta(hours=23, minutes=30)]: - with tm.assertRaises(ValueError): + with tm.assertRaises(period.IncompatibleFrequency): p - o def 
test_nat_ops(self): @@ -3715,77 +3730,153 @@ def test_nat_ops(self): self.assertEqual((p + 1).ordinal, tslib.iNaT) self.assertEqual((1 + p).ordinal, tslib.iNaT) self.assertEqual((p - 1).ordinal, tslib.iNaT) - self.assertEqual( - (p - Period('2011-01', freq=freq)).ordinal, tslib.iNaT) - self.assertEqual( - (Period('2011-01', freq=freq) - p).ordinal, tslib.iNaT) + self.assertEqual((p - Period('2011-01', freq=freq)).ordinal, + tslib.iNaT) + self.assertEqual((Period('2011-01', freq=freq) - p).ordinal, + tslib.iNaT) + + def test_period_ops_offset(self): + p = Period('2011-04-01', freq='D') + result = p + offsets.Day() + exp = pd.Period('2011-04-02', freq='D') + self.assertEqual(result, exp) - def test_pi_ops_nat(self): - idx = PeriodIndex(['2011-01', '2011-02', 'NaT', + result = p - offsets.Day(2) + exp = pd.Period('2011-03-30', freq='D') + self.assertEqual(result, exp) + + msg = "Input cannot be converted to Period\(freq=D\)" + with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg): + p + offsets.Hour(2) + + with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg): + p - offsets.Hour(2) + + +class TestPeriodIndexSeriesMethods(tm.TestCase): + """ Test PeriodIndex and Period Series Ops consistency """ + + def _check(self, values, func, expected): + idx = pd.PeriodIndex(values) + result = func(idx) + tm.assert_index_equal(result, pd.PeriodIndex(expected)) + + s = pd.Series(values) + result = func(s) + + exp = pd.Series(expected) + # Period(NaT) != Period(NaT) + + lmask = result.map(lambda x: x.ordinal != tslib.iNaT) + rmask = exp.map(lambda x: x.ordinal != tslib.iNaT) + tm.assert_series_equal(lmask, rmask) + tm.assert_series_equal(result[lmask], exp[rmask]) + + def test_pi_ops(self): + idx = PeriodIndex(['2011-01', '2011-02', '2011-03', '2011-04'], freq='M', name='idx') - result = idx + 2 - expected = PeriodIndex( - ['2011-03', '2011-04', 'NaT', '2011-06'], freq='M', name='idx') - self.assertTrue(result.equals(expected)) - result2 = result - 2 - self.assertTrue(result2.equals(idx)) + expected = PeriodIndex(['2011-03', '2011-04', + '2011-05', '2011-06'], freq='M', name='idx') + self._check(idx, lambda x: x + 2, expected) + self._check(idx, lambda x: 2 + x, expected) + + self._check(idx + 2, lambda x: x - 2, idx) + result = idx - Period('2011-01', freq='M') + exp = pd.Index([0, 1, 2, 3], name='idx') + tm.assert_index_equal(result, exp) + + result = Period('2011-01', freq='M') - idx + exp = pd.Index([0, -1, -2, -3], name='idx') + tm.assert_index_equal(result, exp) + + def test_pi_ops_errors(self): + idx = PeriodIndex(['2011-01', '2011-02', '2011-03', + '2011-04'], freq='M', name='idx') + s = pd.Series(idx) msg = "unsupported operand type\(s\)" - with tm.assertRaisesRegexp(TypeError, msg): - idx + "str" + for obj in [idx, s]: + for ng in ["str", 1.5]: + with tm.assertRaisesRegexp(TypeError, msg): + obj + ng + + with tm.assertRaises(TypeError): + # error message differs between PY2 and 3 + ng + obj - def test_pi_ops_array(self): + with tm.assertRaisesRegexp(TypeError, msg): + obj - ng + + def test_pi_ops_nat(self): idx = PeriodIndex(['2011-01', '2011-02', 'NaT', '2011-04'], freq='M', name='idx') - result = idx + np.array([1, 2, 3, 4]) + expected = PeriodIndex(['2011-03', '2011-04', + 'NaT', '2011-06'], freq='M', name='idx') + self._check(idx, lambda x: x + 2, expected) + self._check(idx, lambda x: 2 + x, expected) + + self._check(idx + 2, lambda x: x - 2, idx) + + def test_pi_ops_array_int(self): + idx = PeriodIndex(['2011-01', '2011-02', 'NaT', + '2011-04'], freq='M', name='idx') + f = 
lambda x: x + np.array([1, 2, 3, 4]) exp = PeriodIndex(['2011-02', '2011-04', 'NaT', '2011-08'], freq='M', name='idx') - self.assert_index_equal(result, exp) + self._check(idx, f, exp) - result = np.add(idx, np.array([4, -1, 1, 2])) + f = lambda x: np.add(x, np.array([4, -1, 1, 2])) exp = PeriodIndex(['2011-05', '2011-01', 'NaT', '2011-06'], freq='M', name='idx') - self.assert_index_equal(result, exp) + self._check(idx, f, exp) - result = idx - np.array([1, 2, 3, 4]) + f = lambda x: x - np.array([1, 2, 3, 4]) exp = PeriodIndex(['2010-12', '2010-12', 'NaT', '2010-12'], freq='M', name='idx') - self.assert_index_equal(result, exp) + self._check(idx, f, exp) - result = np.subtract(idx, np.array([3, 2, 3, -2])) + f = lambda x: np.subtract(x, np.array([3, 2, 3, -2])) exp = PeriodIndex(['2010-10', '2010-12', 'NaT', '2011-06'], freq='M', name='idx') - self.assert_index_equal(result, exp) - - # incompatible freq - msg = "Input has different freq from PeriodIndex\(freq=M\)" - with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg): - idx + np.array([np.timedelta64(1, 'D')] * 4) - - idx = PeriodIndex(['2011-01-01 09:00', '2011-01-01 10:00', 'NaT', - '2011-01-01 12:00'], freq='H', name='idx') - result = idx + np.array([np.timedelta64(1, 'D')] * 4) - exp = PeriodIndex(['2011-01-02 09:00', '2011-01-02 10:00', 'NaT', - '2011-01-02 12:00'], freq='H', name='idx') - self.assert_index_equal(result, exp) - - result = idx - np.array([np.timedelta64(1, 'h')] * 4) - exp = PeriodIndex(['2011-01-01 08:00', '2011-01-01 09:00', 'NaT', - '2011-01-01 11:00'], freq='H', name='idx') - self.assert_index_equal(result, exp) + self._check(idx, f, exp) + + def test_pi_ops_offset(self): + idx = PeriodIndex(['2011-01-01', '2011-02-01', '2011-03-01', + '2011-04-01'], freq='D', name='idx') + f = lambda x: x + offsets.Day() + exp = PeriodIndex(['2011-01-02', '2011-02-02', '2011-03-02', + '2011-04-02'], freq='D', name='idx') + self._check(idx, f, exp) + + f = lambda x: x + offsets.Day(2) + exp = PeriodIndex(['2011-01-03', '2011-02-03', '2011-03-03', + '2011-04-03'], freq='D', name='idx') + self._check(idx, f, exp) + + f = lambda x: x - offsets.Day(2) + exp = PeriodIndex(['2010-12-30', '2011-01-30', '2011-02-27', + '2011-03-30'], freq='D', name='idx') + self._check(idx, f, exp) + + def test_pi_offset_errors(self): + idx = PeriodIndex(['2011-01-01', '2011-02-01', '2011-03-01', + '2011-04-01'], freq='D', name='idx') + s = pd.Series(idx) + + # Series op is applied per Period instance, thus error is raised + # from Period + msg_idx = "Input has different freq from PeriodIndex\(freq=D\)" + msg_s = "Input cannot be converted to Period\(freq=D\)" + for obj, msg in [(idx, msg_idx), (s, msg_s)]: + with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg): + obj + offsets.Hour(2) - msg = "Input has different freq from PeriodIndex\(freq=H\)" - with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg): - idx + np.array([np.timedelta64(1, 's')] * 4) + with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg): + offsets.Hour(2) + obj - idx = PeriodIndex(['2011-01-01 09:00:00', '2011-01-01 10:00:00', 'NaT', - '2011-01-01 12:00:00'], freq='S', name='idx') - result = idx + np.array([np.timedelta64(1, 'h'), np.timedelta64( - 30, 's'), np.timedelta64(2, 'h'), np.timedelta64(15, 'm')]) - exp = PeriodIndex(['2011-01-01 10:00:00', '2011-01-01 10:00:30', 'NaT', - '2011-01-01 12:15:00'], freq='S', name='idx') - self.assert_index_equal(result, exp) + with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg): + obj - offsets.Hour(2) 
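The arithmetic tests above pin down the stricter frequency checking: mixing
incompatible frequencies now raises ``period.IncompatibleFrequency`` (a
``ValueError`` subclass, so pre-existing ``except ValueError`` handlers keep
working) rather than a bare ``ValueError``. A minimal sketch of the new
behavior (illustrative only, not part of the patch):

.. code-block:: python

    import pandas as pd
    from pandas.tseries import offsets, period

    p = pd.Period('2011-04-01', freq='D')
    p + offsets.Day(2)        # Period('2011-04-03', 'D'); Day fits freq='D'

    try:
        p + offsets.Hour(2)   # an Hour offset cannot be cast to freq='D'
    except period.IncompatibleFrequency as err:
        print(err)            # Input cannot be converted to Period(freq=D)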
def test_pi_sub_period(self): # GH 13071 @@ -3903,7 +3994,7 @@ def test_equal(self): self.assertEqual(self.january1, self.january2) def test_equal_Raises_Value(self): - with tm.assertRaises(ValueError): + with tm.assertRaises(period.IncompatibleFrequency): self.january1 == self.day def test_notEqual(self): @@ -3914,7 +4005,7 @@ def test_greater(self): self.assertTrue(self.february > self.january1) def test_greater_Raises_Value(self): - with tm.assertRaises(ValueError): + with tm.assertRaises(period.IncompatibleFrequency): self.january1 > self.day def test_greater_Raises_Type(self): @@ -3925,8 +4016,9 @@ def test_greaterEqual(self): self.assertTrue(self.january1 >= self.january2) def test_greaterEqual_Raises_Value(self): - with tm.assertRaises(ValueError): + with tm.assertRaises(period.IncompatibleFrequency): self.january1 >= self.day + with tm.assertRaises(TypeError): print(self.january1 >= 1) @@ -3934,7 +4026,7 @@ def test_smallerEqual(self): self.assertTrue(self.january1 <= self.january2) def test_smallerEqual_Raises_Value(self): - with tm.assertRaises(ValueError): + with tm.assertRaises(period.IncompatibleFrequency): self.january1 <= self.day def test_smallerEqual_Raises_Type(self): @@ -3945,7 +4037,7 @@ def test_smaller(self): self.assertTrue(self.january1 < self.february) def test_smaller_Raises_Value(self): - with tm.assertRaises(ValueError): + with tm.assertRaises(period.IncompatibleFrequency): self.january1 < self.day def test_smaller_Raises_Type(self): @@ -4033,7 +4125,7 @@ def test_pi_pi_comp(self): with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg): Period('2011', freq='A') >= base - with tm.assertRaisesRegexp(ValueError, msg): + with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg): idx = PeriodIndex(['2011', '2012', '2013', '2014'], freq='A') base <= idx diff --git a/pandas/tseries/tests/test_resample.py b/pandas/tseries/tests/test_resample.py index 37b16684643be..8e6341c6b7cc3 100644 --- a/pandas/tseries/tests/test_resample.py +++ b/pandas/tseries/tests/test_resample.py @@ -637,13 +637,13 @@ def test_resample_empty_series(self): methods = [method for method in resample_methods if method != 'ohlc'] for method in methods: - expected_index = s.index._shallow_copy(freq=freq) - result = getattr(s.resample(freq), method)() - expected = s - assert_index_equal(result.index, expected_index) - # freq equality not yet checked in assert_index_equal - self.assertEqual(result.index.freq, expected_index.freq) + + expected = s.copy() + expected.index = s.index._shallow_copy(freq=freq) + assert_index_equal(result.index, expected.index) + self.assertEqual(result.index.freq, expected.index.freq) + if (method == 'size' and isinstance(result.index, PeriodIndex) and freq in ['M', 'D']): @@ -665,13 +665,12 @@ def test_resample_empty_dataframe(self): # count retains dimensions too methods = downsample_methods + ['count'] for method in methods: - expected_index = f.index._shallow_copy(freq=freq) result = getattr(f.resample(freq), method)() - expected = f - assert_index_equal(result.index, expected_index) - # freq equality not yet checked in assert_index_equal - # TODO: remove when freq checked - self.assertEqual(result.index.freq, expected_index.freq) + + expected = f.copy() + expected.index = f.index._shallow_copy(freq=freq) + assert_index_equal(result.index, expected.index) + self.assertEqual(result.index.freq, expected.index.freq) assert_frame_equal(result, expected, check_dtype=False) # test size for GH13212 (currently stays as df) diff --git a/pandas/util/testing.py 
b/pandas/util/testing.py index 0ec2c96dbbd7d..39b4cca85ad9c 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -753,6 +753,8 @@ def _get_ilevel_values(index, level): # metadata comparison if check_names: assert_attr_equal('names', left, right, obj=obj) + if isinstance(left, pd.PeriodIndex) or isinstance(right, pd.PeriodIndex): + assert_attr_equal('freq', left, right, obj=obj) if check_categorical: if is_categorical_dtype(left) or is_categorical_dtype(right): From 19ebee56951be8a04c5907dbf657ee0bd8630f41 Mon Sep 17 00:00:00 2001 From: Camilo Cota Date: Sun, 22 May 2016 15:44:12 -0400 Subject: [PATCH 41/96] ENH: support decimal option in PythonParser #12933 closes #12933 Author: Camilo Cota Closes #13189 from camilocot/12933 and squashes the following commits: 465272e [Camilo Cota] Benchmark decimal option in read_csv for c engine 9f42d0c [Camilo Cota] double backticks around decimal and engine='python' dc8ca62 [Camilo Cota] fix test_empty_decimal_marker comment 49613fe [Camilo Cota] Assert read_csv error message in test_empty_decimal_marker d821052 [Camilo Cota] fix test_empty_decimal_marker comment f71509d [Camilo Cota] Include descritive what's new line 803356e [Camilo Cota] set nonnum regex in init method 1472d80 [Camilo Cota] Include the issue number in what's new b560fda [Camilo Cota] Fix what's new dc7acd1 [Camilo Cota] ENH: support decimal option in PythonParser #12933 --- asv_bench/benchmarks/parser_vb.py | 60 +++++++++++++++++++++++-- doc/source/whatsnew/v0.18.2.txt | 2 + pandas/io/parsers.py | 37 ++++++++++++--- pandas/io/tests/parser/c_parser_only.py | 45 ------------------- pandas/io/tests/parser/common.py | 53 ++++++++++++++++++++-- 5 files changed, 137 insertions(+), 60 deletions(-) diff --git a/asv_bench/benchmarks/parser_vb.py b/asv_bench/benchmarks/parser_vb.py index 18cd4de6cc9c5..04f25034638cd 100644 --- a/asv_bench/benchmarks/parser_vb.py +++ b/asv_bench/benchmarks/parser_vb.py @@ -23,18 +23,42 @@ class read_csv_default_converter(object): goal_time = 0.2 def setup(self): - self.data = '0.1213700904466425978256438611,0.0525708283766902484401839501,0.4174092731488769913994474336\n 0.4096341697147408700274695547,0.1587830198973579909349496119,0.1292545832485494372576795285\n 0.8323255650024565799327547210,0.9694902427379478160318626578,0.6295047811546814475747169126\n 0.4679375305798131323697930383,0.2963942381834381301075609371,0.5268936082160610157032465394\n 0.6685382761849776311890991564,0.6721207066140679753374342908,0.6519975277021627935170045020\n ' + self.data = """0.1213700904466425978256438611,0.0525708283766902484401839501,0.4174092731488769913994474336\n +0.4096341697147408700274695547,0.1587830198973579909349496119,0.1292545832485494372576795285\n +0.8323255650024565799327547210,0.9694902427379478160318626578,0.6295047811546814475747169126\n +0.4679375305798131323697930383,0.2963942381834381301075609371,0.5268936082160610157032465394\n +0.6685382761849776311890991564,0.6721207066140679753374342908,0.6519975277021627935170045020\n""" self.data = (self.data * 200) def time_read_csv_default_converter(self): read_csv(StringIO(self.data), sep=',', header=None, float_precision=None) +class read_csv_default_converter_with_decimal(object): + goal_time = 0.2 + + def setup(self): + self.data = """0,1213700904466425978256438611;0,0525708283766902484401839501;0,4174092731488769913994474336\n +0,4096341697147408700274695547;0,1587830198973579909349496119;0,1292545832485494372576795285\n 
+0,8323255650024565799327547210;0,9694902427379478160318626578;0,6295047811546814475747169126\n +0,4679375305798131323697930383;0,2963942381834381301075609371;0,5268936082160610157032465394\n +0,6685382761849776311890991564;0,6721207066140679753374342908;0,6519975277021627935170045020\n""" + self.data = (self.data * 200) + + def time_read_csv_default_converter_with_decimal(self): + read_csv(StringIO(self.data), sep=';', header=None, + float_precision=None, decimal=',') + + class read_csv_precise_converter(object): goal_time = 0.2 def setup(self): - self.data = '0.1213700904466425978256438611,0.0525708283766902484401839501,0.4174092731488769913994474336\n 0.4096341697147408700274695547,0.1587830198973579909349496119,0.1292545832485494372576795285\n 0.8323255650024565799327547210,0.9694902427379478160318626578,0.6295047811546814475747169126\n 0.4679375305798131323697930383,0.2963942381834381301075609371,0.5268936082160610157032465394\n 0.6685382761849776311890991564,0.6721207066140679753374342908,0.6519975277021627935170045020\n ' + self.data = """0.1213700904466425978256438611,0.0525708283766902484401839501,0.4174092731488769913994474336\n +0.4096341697147408700274695547,0.1587830198973579909349496119,0.1292545832485494372576795285\n +0.8323255650024565799327547210,0.9694902427379478160318626578,0.6295047811546814475747169126\n +0.4679375305798131323697930383,0.2963942381834381301075609371,0.5268936082160610157032465394\n +0.6685382761849776311890991564,0.6721207066140679753374342908,0.6519975277021627935170045020\n""" self.data = (self.data * 200) def time_read_csv_precise_converter(self): @@ -45,7 +69,11 @@ class read_csv_roundtrip_converter(object): goal_time = 0.2 def setup(self): - self.data = '0.1213700904466425978256438611,0.0525708283766902484401839501,0.4174092731488769913994474336\n 0.4096341697147408700274695547,0.1587830198973579909349496119,0.1292545832485494372576795285\n 0.8323255650024565799327547210,0.9694902427379478160318626578,0.6295047811546814475747169126\n 0.4679375305798131323697930383,0.2963942381834381301075609371,0.5268936082160610157032465394\n 0.6685382761849776311890991564,0.6721207066140679753374342908,0.6519975277021627935170045020\n ' + self.data = """0.1213700904466425978256438611,0.0525708283766902484401839501,0.4174092731488769913994474336\n +0.4096341697147408700274695547,0.1587830198973579909349496119,0.1292545832485494372576795285\n +0.8323255650024565799327547210,0.9694902427379478160318626578,0.6295047811546814475747169126\n +0.4679375305798131323697930383,0.2963942381834381301075609371,0.5268936082160610157032465394\n +0.6685382761849776311890991564,0.6721207066140679753374342908,0.6519975277021627935170045020\n""" self.data = (self.data * 200) def time_read_csv_roundtrip_converter(self): @@ -109,4 +137,28 @@ def setup(self): self.data = (self.data * 200) def time_read_table_multiple_date_baseline(self): - read_table(StringIO(self.data), sep=',', header=None, parse_dates=[1]) \ No newline at end of file + read_table(StringIO(self.data), sep=',', header=None, parse_dates=[1]) + + +class read_csv_default_converter_python_engine(object): + goal_time = 0.2 + + def setup(self): + self.data = '0.1213700904466425978256438611,0.0525708283766902484401839501,0.4174092731488769913994474336\n 0.4096341697147408700274695547,0.1587830198973579909349496119,0.1292545832485494372576795285\n 0.8323255650024565799327547210,0.9694902427379478160318626578,0.6295047811546814475747169126\n 
0.4679375305798131323697930383,0.2963942381834381301075609371,0.5268936082160610157032465394\n 0.6685382761849776311890991564,0.6721207066140679753374342908,0.6519975277021627935170045020\n ' + self.data = (self.data * 200) + + def time_read_csv_default_converter(self): + read_csv(StringIO(self.data), sep=',', header=None, + float_precision=None, engine='python') + + +class read_csv_default_converter_with_decimal_python_engine(object): + goal_time = 0.2 + + def setup(self): + self.data = '0,1213700904466425978256438611;0,0525708283766902484401839501;0,4174092731488769913994474336\n 0,4096341697147408700274695547;0,1587830198973579909349496119;0,1292545832485494372576795285\n 0,8323255650024565799327547210;0,9694902427379478160318626578;0,6295047811546814475747169126\n 0,4679375305798131323697930383;0,2963942381834381301075609371;0,5268936082160610157032465394\n 0,6685382761849776311890991564;0,6721207066140679753374342908;0,6519975277021627935170045020\n ' + self.data = (self.data * 200) + + def time_read_csv_default_converter_with_decimal(self): + read_csv(StringIO(self.data), sep=';', header=None, + float_precision=None, decimal=',', engine='python') diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index 4b3c96da10efd..cbf95a10447d5 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -47,6 +47,8 @@ Other enhancements pd.Timestamp(year=2012, month=1, day=1, hour=8, minute=30) +- The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``decimal`` option (:issue:`12933`) + .. _whatsnew_0182.api: API changes diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 25639984e4ccf..07b92fd6bfd28 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -348,6 +348,7 @@ def _read(filepath_or_buffer, kwds): 'keep_default_na': True, 'thousands': None, 'comment': None, + 'decimal': b'.', # 'engine': 'c', 'parse_dates': False, @@ -383,7 +384,6 @@ def _read(filepath_or_buffer, kwds): 'error_bad_lines': True, 'warn_bad_lines': True, 'dtype': None, - 'decimal': b'.', 'float_precision': None } @@ -404,7 +404,6 @@ def _read(filepath_or_buffer, kwds): 'error_bad_lines', 'warn_bad_lines', 'dtype', - 'decimal', 'float_precision', ]) @@ -1582,6 +1581,7 @@ def __init__(self, f, **kwds): self.converters = kwds['converters'] self.thousands = kwds['thousands'] + self.decimal = kwds['decimal'] self.comment = kwds['comment'] self._comment_lines = [] @@ -1639,6 +1639,15 @@ def __init__(self, f, **kwds): else: self._no_thousands_columns = None + if len(self.decimal) != 1: + raise ValueError('Only length-1 decimal markers supported') + + if self.thousands is None: + self.nonnum = re.compile('[^-^0-9^%s]+' % self.decimal) + else: + self.nonnum = re.compile('[^-^0-9^%s^%s]+' % (self.thousands, + self.decimal)) + def _set_no_thousands_columns(self): # Create a set of column ids that are not to be stripped of thousands # operators. 
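Taken together, the ``__init__`` changes above validate the decimal marker and
precompile the ``nonnum`` guard pattern; the hunk below then funnels both
thousands and decimal handling through a single search-and-replace helper. A
rough standalone sketch of that guard-then-replace idea (the helper name and
the combined single pass are illustrative; the parser itself runs separate
thousands and decimal passes):

.. code-block:: python

    import re

    def normalize_field(field, thousands=',', decimal='.'):
        # same guard as the parser: skip any field containing characters
        # other than digits, '-', and the two configured markers
        nonnum = re.compile('[^-^0-9^%s^%s]+' % (thousands, decimal))
        if nonnum.search(field.strip()):
            return field                      # leave non-numeric data alone
        field = field.replace(thousands, '')  # '1.234,56' -> '1234,56'
        return field.replace(decimal, '.')    # '1234,56'  -> '1234.56'

    normalize_field('1.234,56', thousands='.', decimal=',')  # '1234.56'
    normalize_field('ABC', thousands='.', decimal=',')       # 'ABC'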
@@ -2050,22 +2059,35 @@ def _check_empty(self, lines): def _check_thousands(self, lines): if self.thousands is None: return lines - nonnum = re.compile('[^-^0-9^%s^.]+' % self.thousands) + + return self._search_replace_num_columns(lines=lines, + search=self.thousands, + replace='') + + def _search_replace_num_columns(self, lines, search, replace): ret = [] for l in lines: rl = [] for i, x in enumerate(l): if (not isinstance(x, compat.string_types) or - self.thousands not in x or + search not in x or (self._no_thousands_columns and i in self._no_thousands_columns) or - nonnum.search(x.strip())): + self.nonnum.search(x.strip())): rl.append(x) else: - rl.append(x.replace(self.thousands, '')) + rl.append(x.replace(search, replace)) ret.append(rl) return ret + def _check_decimal(self, lines): + if self.decimal == _parser_defaults['decimal']: + return lines + + return self._search_replace_num_columns(lines=lines, + search=self.decimal, + replace='.') + def _clear_buffer(self): self.buf = [] @@ -2249,7 +2271,8 @@ def _get_lines(self, rows=None): lines = self._check_comments(lines) if self.skip_blank_lines: lines = self._check_empty(lines) - return self._check_thousands(lines) + lines = self._check_thousands(lines) + return self._check_decimal(lines) def _make_date_converter(date_parser=None, dayfirst=False, diff --git a/pandas/io/tests/parser/c_parser_only.py b/pandas/io/tests/parser/c_parser_only.py index 24c670abe8158..8e44802adf744 100644 --- a/pandas/io/tests/parser/c_parser_only.py +++ b/pandas/io/tests/parser/c_parser_only.py @@ -353,17 +353,6 @@ def test_disable_bool_parsing(self): result = self.read_csv(StringIO(data), dtype=object, na_filter=False) self.assertEqual(result['B'][2], '') - def test_euro_decimal_format(self): - data = """Id;Number1;Number2;Text1;Text2;Number3 -1;1521,1541;187101,9543;ABC;poi;4,738797819 -2;121,12;14897,76;DEF;uyt;0,377320872 -3;878,158;108013,434;GHI;rez;2,735694704""" - - df2 = self.read_csv(StringIO(data), sep=';', decimal=',') - self.assertEqual(df2['Number1'].dtype, float) - self.assertEqual(df2['Number2'].dtype, float) - self.assertEqual(df2['Number3'].dtype, float) - def test_custom_lineterminator(self): data = 'a,b,c~1,2,3~4,5,6' @@ -444,40 +433,6 @@ def test_raise_on_no_columns(self): data = "\n\n\n" self.assertRaises(ValueError, self.read_csv, StringIO(data)) - def test_1000_sep_with_decimal(self): - data = """A|B|C -1|2,334.01|5 -10|13|10. -""" - expected = DataFrame({ - 'A': [1, 10], - 'B': [2334.01, 13], - 'C': [5, 10.] - }) - - tm.assert_equal(expected.A.dtype, 'int64') - tm.assert_equal(expected.B.dtype, 'float') - tm.assert_equal(expected.C.dtype, 'float') - - df = self.read_csv(StringIO(data), sep='|', thousands=',', decimal='.') - tm.assert_frame_equal(df, expected) - - df = self.read_table(StringIO(data), sep='|', - thousands=',', decimal='.') - tm.assert_frame_equal(df, expected) - - data_with_odd_sep = """A|B|C -1|2.334,01|5 -10|13|10, -""" - df = self.read_csv(StringIO(data_with_odd_sep), - sep='|', thousands='.', decimal=',') - tm.assert_frame_equal(df, expected) - - df = self.read_table(StringIO(data_with_odd_sep), - sep='|', thousands='.', decimal=',') - tm.assert_frame_equal(df, expected) - def test_grow_boundary_at_cap(self): # See gh-12494 # diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py index 4d9ce922184d9..57ab9477302c1 100644 --- a/pandas/io/tests/parser/common.py +++ b/pandas/io/tests/parser/common.py @@ -41,10 +41,10 @@ def test_empty_decimal_marker(self): 1|2,334|5 10|13|10. 
""" - # C parser: supports only length-1 decimals - # Python parser: 'decimal' not supported yet - self.assertRaises(ValueError, self.read_csv, - StringIO(data), decimal='') + # Parsers support only length-1 decimals + msg = 'Only length-1 decimal markers supported' + with tm.assertRaisesRegexp(ValueError, msg): + self.read_csv(StringIO(data), decimal='') def test_read_csv(self): if not compat.PY3: @@ -1236,3 +1236,48 @@ def test_iteration_open_handle(self): result = self.read_table(f, squeeze=True, header=None) expected = Series(['DDD', 'EEE', 'FFF', 'GGG'], name=0) tm.assert_series_equal(result, expected) + + def test_1000_sep_with_decimal(self): + data = """A|B|C +1|2,334.01|5 +10|13|10. +""" + expected = DataFrame({ + 'A': [1, 10], + 'B': [2334.01, 13], + 'C': [5, 10.] + }) + + tm.assert_equal(expected.A.dtype, 'int64') + tm.assert_equal(expected.B.dtype, 'float') + tm.assert_equal(expected.C.dtype, 'float') + + df = self.read_csv(StringIO(data), sep='|', thousands=',', decimal='.') + tm.assert_frame_equal(df, expected) + + df = self.read_table(StringIO(data), sep='|', + thousands=',', decimal='.') + tm.assert_frame_equal(df, expected) + + data_with_odd_sep = """A|B|C +1|2.334,01|5 +10|13|10, +""" + df = self.read_csv(StringIO(data_with_odd_sep), + sep='|', thousands='.', decimal=',') + tm.assert_frame_equal(df, expected) + + df = self.read_table(StringIO(data_with_odd_sep), + sep='|', thousands='.', decimal=',') + tm.assert_frame_equal(df, expected) + + def test_euro_decimal_format(self): + data = """Id;Number1;Number2;Text1;Text2;Number3 +1;1521,1541;187101,9543;ABC;poi;4,738797819 +2;121,12;14897,76;DEF;uyt;0,377320872 +3;878,158;108013,434;GHI;rez;2,735694704""" + + df2 = self.read_csv(StringIO(data), sep=';', decimal=',') + self.assertEqual(df2['Number1'].dtype, float) + self.assertEqual(df2['Number2'].dtype, float) + self.assertEqual(df2['Number3'].dtype, float) From f8a11ddccd98c4da3366294cc7ffd2d82ffb4106 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Lipt=C3=A1k?= Date: Mon, 23 May 2016 08:58:45 -0400 Subject: [PATCH 42/96] ERR: Correct ValueError invalid type promotion exception MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit closes #12599 Author: Gábor Lipták Closes #13234 from gliptak/invalidtypepromotion and squashes the following commits: 88f144b [Gábor Lipták] Correct ValueError invalid type promotion exception --- doc/source/whatsnew/v0.18.2.txt | 31 +++++++++++++++++++++++++++- pandas/core/indexing.py | 9 +++++--- pandas/tests/series/test_indexing.py | 10 +++++++++ 3 files changed, 46 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index cbf95a10447d5..1e2ea618a4a65 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -59,7 +59,6 @@ API changes - An ``UnsupportedFunctionCall`` error is now raised if numpy ufuncs like ``np.mean`` are called on groupby or resample objects (:issue:`12811`) - Calls to ``.sample()`` will respect the random seed set via ``numpy.random.seed(n)`` (:issue:`13161`) - .. _whatsnew_0182.api.tolist: ``Series.tolist()`` will now return Python types @@ -88,6 +87,36 @@ New Behavior: type(s.tolist()[0]) +.. _whatsnew_0182.api.promote: + +``Series`` type promotoion on assignment +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +A ``Series`` will now correctly promote its dtype with assignment with incompat values to the current dtype (:issue:`13234`) + + +.. 
+
+    s = pd.Series()
+
+Previous Behavior:
+
+.. code-block:: ipython
+
+    In [2]: s["a"] = pd.Timestamp("2016-01-01")
+
+    In [3]: s["b"] = 3.0
+    TypeError: invalid type promotion
+
+New Behavior:
+
+.. ipython:: python
+
+    s["a"] = pd.Timestamp("2016-01-01")
+    s["b"] = 3.0
+    s
+    s.dtype
+
 .. _whatsnew_0182.api.to_datetime_coerce:

 ``.to_datetime()`` when coercing
diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
index acb0675247a78..9485f50ed07f1 100644
--- a/pandas/core/indexing.py
+++ b/pandas/core/indexing.py
@@ -336,9 +336,12 @@ def _setitem_with_indexer(self, indexer, value):
                     # this preserves dtype of the value
                     new_values = Series([value])._values
                     if len(self.obj._values):
-                        new_values = np.concatenate([self.obj._values,
-                                                     new_values])
-
+                        try:
+                            new_values = np.concatenate([self.obj._values,
+                                                         new_values])
+                        except TypeError:
+                            new_values = np.concatenate([self.obj.asobject,
+                                                         new_values])
                     self.obj._data = self.obj._constructor(
                         new_values, index=new_index, name=self.obj.name)._data
                     self.obj._maybe_update_cacher(clear=True)
diff --git a/pandas/tests/series/test_indexing.py b/pandas/tests/series/test_indexing.py
index 5ed3fda7d0b8f..29cd887c7075f 100644
--- a/pandas/tests/series/test_indexing.py
+++ b/pandas/tests/series/test_indexing.py
@@ -287,6 +287,16 @@ def test_getitem_generator(self):
         assert_series_equal(result, expected)
         assert_series_equal(result2, expected)

+    def test_type_promotion(self):
+        # GH12599
+        s = pd.Series()
+        s["a"] = pd.Timestamp("2016-01-01")
+        s["b"] = 3.0
+        s["c"] = "foo"
+        expected = Series([pd.Timestamp("2016-01-01"), 3.0, "foo"],
+                          index=["a", "b", "c"])
+        assert_series_equal(s, expected)
+
     def test_getitem_boolean_object(self):
         # using column from DataFrame

From afde7187e22b2013147d0a15911f6ec72e056a43 Mon Sep 17 00:00:00 2001
From: pijucha
Date: Mon, 23 May 2016 16:34:20 -0400
Subject: [PATCH 43/96] BUG: Fix #13149 and ENH: 'copy' param in Index.astype()

closes #13149

1. Float64Index.astype(int) raises ValueError if a NaN is present.
   Previously, it converted NaN's to the smallest negative integer.
2. TimedeltaIndex.astype(int) and DatetimeIndex.astype(int) return
   Int64Index, which is consistent with behavior of other Indexes.
   Previously, they returned a numpy.array of ints.
3. Added bool parameter 'copy' to Index.astype()
4. Fixed core.common.is_timedelta64_ns_dtype().
5. Set a default NaT representation to a string type in a parameter of
   DatetimeIndex._format_native_types(). Previously, it produced a
   unicode u'NaT' in Python2.
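A quick sketch of items (1)-(3) in action (illustrative, written against the
patched behavior described above; exact messages and reprs may differ):

.. code-block:: python

    import numpy as np
    import pandas as pd

    # (1) astype(int) on a Float64Index containing NaN now raises
    idx = pd.Float64Index([0.0, 1.1, np.nan])
    try:
        idx.astype(int)
    except ValueError as err:
        print(err)  # cannot convert float NaN to integer

    # (2) datetime/timedelta indexes now return an Int64Index, not ndarray
    dti = pd.date_range('2016-01-01', periods=3)
    type(dti.astype(int))  # Int64Index

    # (3) the new 'copy' keyword can avoid a copy when no cast is needed
    tdi = pd.timedelta_range('1 days', periods=3)
    tdi.astype('timedelta64[ns]', copy=False) is tdi  # True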
Author: pijucha Closes #13209 from pijucha/bug13149 and squashes the following commits: 8b29902 [pijucha] BUG: Fix #13149 and ENH: 'copy' param in Index.astype() --- doc/source/whatsnew/v0.18.2.txt | 6 + pandas/core/categorical.py | 21 ++- pandas/core/common.py | 2 +- pandas/core/ops.py | 6 +- pandas/indexes/base.py | 24 ++- pandas/indexes/multi.py | 7 +- pandas/indexes/numeric.py | 15 +- pandas/tests/indexes/test_datetimelike.py | 198 +++++++++++++++++++++- pandas/tests/indexes/test_numeric.py | 5 + pandas/tests/test_common.py | 15 ++ pandas/tseries/index.py | 41 +++-- pandas/tseries/period.py | 17 +- pandas/tseries/tdi.py | 32 ++-- pandas/tseries/tests/test_base.py | 33 ---- pandas/tseries/tests/test_period.py | 6 - pandas/tseries/tests/test_timedeltas.py | 6 - pandas/tseries/tests/test_timeseries.py | 28 --- 17 files changed, 330 insertions(+), 132 deletions(-) diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index 1e2ea618a4a65..e2e40b643ba99 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -49,6 +49,9 @@ Other enhancements - The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``decimal`` option (:issue:`12933`) +- ``Index.astype()`` now accepts an optional boolean argument ``copy``, which allows optional copying if the requirements on dtype are satisfied (:issue:`13209`) +- ``Categorical.astype()`` now accepts an optional boolean argument ``copy``, effective when dtype is categorical (:issue:`13209`) + .. _whatsnew_0182.api: API changes @@ -143,6 +146,9 @@ This will now convert integers/floats with the default unit of ``ns``. Other API changes ^^^^^^^^^^^^^^^^^ +- ``Float64Index.astype(int)`` will now raise ``ValueError`` if ``Float64Index`` contains ``NaN`` values (:issue:`13149`) +- ``TimedeltaIndex.astype(int)`` and ``DatetimeIndex.astype(int)`` will now return ``Int64Index`` instead of ``np.array`` (:issue:`13209`) + .. _whatsnew_0182.deprecations: Deprecations diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 44c91862227d8..f0a83cbe77d3c 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -336,11 +336,26 @@ def copy(self): categories=self.categories, ordered=self.ordered, fastpath=True) - def astype(self, dtype): - """ coerce this type to another dtype """ + def astype(self, dtype, copy=True): + """ + Coerce this type to another dtype + + Parameters + ---------- + dtype : numpy dtype or pandas type + copy : bool, default True + By default, astype always returns a newly allocated object. + If copy is set to False and dtype is categorical, the original + object is returned. + + .. 
versionadded:: 0.18.2 + + """ if is_categorical_dtype(dtype): + if copy is True: + return self.copy() return self - return np.array(self, dtype=dtype) + return np.array(self, dtype=dtype, copy=copy) @cache_readonly def ndim(self): diff --git a/pandas/core/common.py b/pandas/core/common.py index 8af6b78a050f3..1be6ce810791b 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -1600,7 +1600,7 @@ def is_timedelta64_dtype(arr_or_dtype): def is_timedelta64_ns_dtype(arr_or_dtype): - tipo = _get_dtype_type(arr_or_dtype) + tipo = _get_dtype(arr_or_dtype) return tipo == _TD_DTYPE diff --git a/pandas/core/ops.py b/pandas/core/ops.py index b02f94cc92e22..d1bb67fa0bc13 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -422,7 +422,7 @@ def _convert_to_array(self, values, name=None, other=None): values = tslib.array_to_datetime(values) elif inferred_type in ('timedelta', 'timedelta64'): # have a timedelta, convert to to ns here - values = to_timedelta(values, errors='coerce') + values = to_timedelta(values, errors='coerce', box=False) elif inferred_type == 'integer': # py3 compat where dtype is 'm' but is an integer if values.dtype.kind == 'm': @@ -504,9 +504,9 @@ def _offset(lvalues, rvalues): # convert Tick DateOffset to underlying delta if self.is_offset_lhs: - lvalues = to_timedelta(lvalues) + lvalues = to_timedelta(lvalues, box=False) if self.is_offset_rhs: - rvalues = to_timedelta(rvalues) + rvalues = to_timedelta(rvalues, box=False) lvalues = lvalues.astype(np.int64) if not self.is_floating_rhs: diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index dc178c1178c74..c029a4a74d9d0 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -754,8 +754,28 @@ def _to_embed(self, keep_tz=False): """ return self.values.copy() - def astype(self, dtype): - return Index(self.values.astype(dtype), name=self.name, dtype=dtype) + _index_shared_docs['astype'] = """ + Create an Index with values cast to dtypes. The class of a new Index + is determined by dtype. When conversion is impossible, a ValueError + exception is raised. + + Parameters + ---------- + dtype : numpy dtype or pandas type + copy : bool, default True + By default, astype always returns a newly allocated object. + If copy is set to False and internal requirements on dtype are + satisfied, the original data is used to create a new Index + or the original Index is returned. + + .. 
versionadded:: 0.18.2 + + """ + + @Appender(_index_shared_docs['astype']) + def astype(self, dtype, copy=True): + return Index(self.values.astype(dtype, copy=copy), name=self.name, + dtype=dtype) def _to_safe_for_reshape(self): """ convert to object if we are a categorical """ diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py index 6f3360cdf82a7..9f71f9f17d835 100644 --- a/pandas/indexes/multi.py +++ b/pandas/indexes/multi.py @@ -2074,11 +2074,14 @@ def difference(self, other): return MultiIndex.from_tuples(difference, sortorder=0, names=result_names) - def astype(self, dtype): + @Appender(_index_shared_docs['astype']) + def astype(self, dtype, copy=True): if not is_object_dtype(np.dtype(dtype)): raise TypeError('Setting %s dtype to anything other than object ' 'is not supported' % self.__class__) - return self._shallow_copy() + elif copy is True: + return self._shallow_copy() + return self def _convert_can_do_setop(self, other): result_names = self.names diff --git a/pandas/indexes/numeric.py b/pandas/indexes/numeric.py index 983ea731b11ac..0deaf4da9b2bb 100644 --- a/pandas/indexes/numeric.py +++ b/pandas/indexes/numeric.py @@ -4,7 +4,7 @@ import pandas.index as _index from pandas import compat -from pandas.indexes.base import Index, InvalidIndexError +from pandas.indexes.base import Index, InvalidIndexError, _index_shared_docs from pandas.util.decorators import Appender, cache_readonly import pandas.core.common as com from pandas.core.common import (is_dtype_equal, isnull, pandas_dtype, @@ -238,12 +238,17 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, def inferred_type(self): return 'floating' - def astype(self, dtype): + @Appender(_index_shared_docs['astype']) + def astype(self, dtype, copy=True): dtype = pandas_dtype(dtype) - if is_float_dtype(dtype) or is_integer_dtype(dtype): - values = self._values.astype(dtype) + if is_float_dtype(dtype): + values = self._values.astype(dtype, copy=copy) + elif is_integer_dtype(dtype): + if self.hasnans: + raise ValueError('cannot convert float NaN to integer') + values = self._values.astype(dtype, copy=copy) elif is_object_dtype(dtype): - values = self._values + values = self._values.astype('object', copy=copy) else: raise TypeError('Setting %s dtype to anything other than ' 'float64 or object is not supported' % diff --git a/pandas/tests/indexes/test_datetimelike.py b/pandas/tests/indexes/test_datetimelike.py index b0ca07e84f7ce..bd3deb8e6ed36 100644 --- a/pandas/tests/indexes/test_datetimelike.py +++ b/pandas/tests/indexes/test_datetimelike.py @@ -4,9 +4,10 @@ import numpy as np -from pandas import (date_range, period_range, - Series, Index, DatetimeIndex, - TimedeltaIndex, PeriodIndex) +from pandas import (DatetimeIndex, Float64Index, Index, Int64Index, + NaT, Period, PeriodIndex, Series, Timedelta, + TimedeltaIndex, date_range, period_range, + timedelta_range) import pandas.util.testing as tm @@ -337,6 +338,117 @@ def test_construction_dti_with_mixed_timezones(self): Timestamp('2011-01-02 10:00', tz='US/Eastern')], tz='US/Eastern', name='idx') + def test_astype(self): + # GH 13149, GH 13209 + idx = DatetimeIndex(['2016-05-16', 'NaT', NaT, np.NaN]) + + result = idx.astype(object) + expected = Index([Timestamp('2016-05-16')] + [NaT] * 3, dtype=object) + tm.assert_index_equal(result, expected) + + result = idx.astype(int) + expected = Int64Index([1463356800000000000] + + [-9223372036854775808] * 3, dtype=np.int64) + tm.assert_index_equal(result, expected) + + rng = date_range('1/1/2000', periods=10) + result = 
rng.astype('i8') + self.assert_numpy_array_equal(result, rng.asi8) + + def test_astype_with_tz(self): + + # with tz + rng = date_range('1/1/2000', periods=10, tz='US/Eastern') + result = rng.astype('datetime64[ns]') + expected = (date_range('1/1/2000', periods=10, + tz='US/Eastern') + .tz_convert('UTC').tz_localize(None)) + tm.assert_index_equal(result, expected) + + # BUG#10442 : testing astype(str) is correct for Series/DatetimeIndex + result = pd.Series(pd.date_range('2012-01-01', periods=3)).astype(str) + expected = pd.Series( + ['2012-01-01', '2012-01-02', '2012-01-03'], dtype=object) + tm.assert_series_equal(result, expected) + + result = Series(pd.date_range('2012-01-01', periods=3, + tz='US/Eastern')).astype(str) + expected = Series(['2012-01-01 00:00:00-05:00', + '2012-01-02 00:00:00-05:00', + '2012-01-03 00:00:00-05:00'], + dtype=object) + tm.assert_series_equal(result, expected) + + def test_astype_str_compat(self): + # GH 13149, GH 13209 + # verify that we are returing NaT as a string (and not unicode) + + idx = DatetimeIndex(['2016-05-16', 'NaT', NaT, np.NaN]) + result = idx.astype(str) + expected = Index(['2016-05-16', 'NaT', 'NaT', 'NaT'], dtype=object) + tm.assert_index_equal(result, expected) + + def test_astype_str(self): + # test astype string - #10442 + result = date_range('2012-01-01', periods=4, + name='test_name').astype(str) + expected = Index(['2012-01-01', '2012-01-02', '2012-01-03', + '2012-01-04'], name='test_name', dtype=object) + tm.assert_index_equal(result, expected) + + # test astype string with tz and name + result = date_range('2012-01-01', periods=3, name='test_name', + tz='US/Eastern').astype(str) + expected = Index(['2012-01-01 00:00:00-05:00', + '2012-01-02 00:00:00-05:00', + '2012-01-03 00:00:00-05:00'], + name='test_name', dtype=object) + tm.assert_index_equal(result, expected) + + # test astype string with freqH and name + result = date_range('1/1/2011', periods=3, freq='H', + name='test_name').astype(str) + expected = Index(['2011-01-01 00:00:00', '2011-01-01 01:00:00', + '2011-01-01 02:00:00'], + name='test_name', dtype=object) + tm.assert_index_equal(result, expected) + + # test astype string with freqH and timezone + result = date_range('3/6/2012 00:00', periods=2, freq='H', + tz='Europe/London', name='test_name').astype(str) + expected = Index(['2012-03-06 00:00:00+00:00', + '2012-03-06 01:00:00+00:00'], + dtype=object, name='test_name') + tm.assert_index_equal(result, expected) + + def test_astype_datetime64(self): + # GH 13149, GH 13209 + idx = DatetimeIndex(['2016-05-16', 'NaT', NaT, np.NaN]) + + result = idx.astype('datetime64[ns]') + tm.assert_index_equal(result, idx) + self.assertFalse(result is idx) + + result = idx.astype('datetime64[ns]', copy=False) + tm.assert_index_equal(result, idx) + self.assertTrue(result is idx) + + idx_tz = DatetimeIndex(['2016-05-16', 'NaT', NaT, np.NaN], tz='EST') + result = idx_tz.astype('datetime64[ns]') + expected = DatetimeIndex(['2016-05-16 05:00:00', 'NaT', 'NaT', 'NaT'], + dtype='datetime64[ns]') + tm.assert_index_equal(result, expected) + + def test_astype_raises(self): + # GH 13149, GH 13209 + idx = DatetimeIndex(['2016-05-16', 'NaT', NaT, np.NaN]) + + self.assertRaises(ValueError, idx.astype, float) + self.assertRaises(ValueError, idx.astype, 'timedelta64') + self.assertRaises(ValueError, idx.astype, 'timedelta64[ns]') + self.assertRaises(ValueError, idx.astype, 'datetime64') + self.assertRaises(ValueError, idx.astype, 'datetime64[D]') + def test_get_loc(self): idx = pd.date_range('2000-01-01', 
periods=3) @@ -585,6 +697,42 @@ def setUp(self): def create_index(self): return period_range('20130101', periods=5, freq='D') + def test_astype(self): + # GH 13149, GH 13209 + idx = PeriodIndex(['2016-05-16', 'NaT', NaT, np.NaN], freq='D') + + result = idx.astype(object) + expected = Index([Period('2016-05-16', freq='D')] + + [Period(NaT, freq='D')] * 3, dtype='object') + # Hack because of lack of support for Period null checking (GH12759) + tm.assert_index_equal(result[:1], expected[:1]) + result_arr = np.asarray([p.ordinal for p in result], dtype=np.int64) + expected_arr = np.asarray([p.ordinal for p in expected], + dtype=np.int64) + tm.assert_numpy_array_equal(result_arr, expected_arr) + # TODO: When GH12759 is resolved, change the above hack to: + # tm.assert_index_equal(result, expected) # now, it raises. + + result = idx.astype(int) + expected = Int64Index([16937] + [-9223372036854775808] * 3, + dtype=np.int64) + tm.assert_index_equal(result, expected) + + idx = period_range('1990', '2009', freq='A') + result = idx.astype('i8') + self.assert_numpy_array_equal(result, idx.values) + + def test_astype_raises(self): + # GH 13149, GH 13209 + idx = PeriodIndex(['2016-05-16', 'NaT', NaT, np.NaN], freq='D') + + self.assertRaises(ValueError, idx.astype, str) + self.assertRaises(ValueError, idx.astype, float) + self.assertRaises(ValueError, idx.astype, 'timedelta64') + self.assertRaises(ValueError, idx.astype, 'timedelta64[ns]') + self.assertRaises(ValueError, idx.astype, 'datetime64') + self.assertRaises(ValueError, idx.astype, 'datetime64[ns]') + def test_shift(self): # test shift for PeriodIndex @@ -726,6 +874,50 @@ def test_shift(self): '10 days 01:00:03'], freq='D') self.assert_index_equal(result, expected) + def test_astype(self): + # GH 13149, GH 13209 + idx = TimedeltaIndex([1e14, 'NaT', pd.NaT, np.NaN]) + + result = idx.astype(object) + expected = Index([Timedelta('1 days 03:46:40')] + [pd.NaT] * 3, + dtype=object) + tm.assert_index_equal(result, expected) + + result = idx.astype(int) + expected = Int64Index([100000000000000] + [-9223372036854775808] * 3, + dtype=np.int64) + tm.assert_index_equal(result, expected) + + rng = timedelta_range('1 days', periods=10) + + result = rng.astype('i8') + self.assert_numpy_array_equal(result, rng.asi8) + + def test_astype_timedelta64(self): + # GH 13149, GH 13209 + idx = TimedeltaIndex([1e14, 'NaT', pd.NaT, np.NaN]) + + result = idx.astype('timedelta64') + expected = Float64Index([1e+14] + [np.NaN] * 3, dtype='float64') + tm.assert_index_equal(result, expected) + + result = idx.astype('timedelta64[ns]') + tm.assert_index_equal(result, idx) + self.assertFalse(result is idx) + + result = idx.astype('timedelta64[ns]', copy=False) + tm.assert_index_equal(result, idx) + self.assertTrue(result is idx) + + def test_astype_raises(self): + # GH 13149, GH 13209 + idx = TimedeltaIndex([1e14, 'NaT', pd.NaT, np.NaN]) + + self.assertRaises(ValueError, idx.astype, float) + self.assertRaises(ValueError, idx.astype, str) + self.assertRaises(ValueError, idx.astype, 'datetime64') + self.assertRaises(ValueError, idx.astype, 'datetime64[ns]') + def test_get_loc(self): idx = pd.to_timedelta(['0 days', '1 days', '2 days']) diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py index 8592ae1741a4e..abb9d55e27758 100644 --- a/pandas/tests/indexes/test_numeric.py +++ b/pandas/tests/indexes/test_numeric.py @@ -259,6 +259,11 @@ def test_astype(self): for dtype in ['M8[ns]', 'm8[ns]']: self.assertRaises(TypeError, lambda: i.astype(dtype)) + # 
GH 13149 + for dtype in ['int16', 'int32', 'int64']: + i = Float64Index([0, 1.1, np.NAN]) + self.assertRaises(ValueError, lambda: i.astype(dtype)) + def test_equals(self): i = Float64Index([1.0, 2.0]) diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index e1b186f6c21e5..ad43dc1c09ef1 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -817,6 +817,21 @@ def test_dict_compat(): assert (com._dict_compat(data_unchanged) == data_unchanged) +def test_is_timedelta(): + assert (com.is_timedelta64_dtype('timedelta64')) + assert (com.is_timedelta64_dtype('timedelta64[ns]')) + assert (not com.is_timedelta64_ns_dtype('timedelta64')) + assert (com.is_timedelta64_ns_dtype('timedelta64[ns]')) + + tdi = TimedeltaIndex([1e14, 2e14], dtype='timedelta64') + assert (com.is_timedelta64_dtype(tdi)) + assert (com.is_timedelta64_ns_dtype(tdi)) + assert (com.is_timedelta64_ns_dtype(tdi.astype('timedelta64[ns]'))) + # Conversion to Int64Index: + assert (not com.is_timedelta64_ns_dtype(tdi.astype('timedelta64'))) + assert (not com.is_timedelta64_ns_dtype(tdi.astype('timedelta64[h]'))) + + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index 25d3490873542..83ab5d2a2bce4 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -6,16 +6,17 @@ from datetime import timedelta import numpy as np from pandas.core.base import _shared_docs -from pandas.core.common import (_NS_DTYPE, _INT64_DTYPE, - _values_from_object, _maybe_box, - is_object_dtype, is_datetime64_dtype, - is_datetimetz, is_dtype_equal, - ABCSeries, is_integer, is_float, - DatetimeTZDtype, PerformanceWarning) +from pandas.core.common import (_INT64_DTYPE, _NS_DTYPE, _maybe_box, + _values_from_object, ABCSeries, + DatetimeTZDtype, PerformanceWarning, + is_datetimetz, is_datetime64_dtype, + is_datetime64_ns_dtype, is_dtype_equal, + is_float, is_integer, is_integer_dtype, + is_object_dtype, is_string_dtype) from pandas.core.index import Index, Int64Index, Float64Index +from pandas.indexes.base import _index_shared_docs import pandas.compat as compat -from pandas.compat import u from pandas.tseries.frequencies import ( to_offset, get_period_alias, Resolution) @@ -814,8 +815,7 @@ def _add_offset(self, offset): "or DatetimeIndex", PerformanceWarning) return self.astype('O') + offset - def _format_native_types(self, na_rep=u('NaT'), - date_format=None, **kwargs): + def _format_native_types(self, na_rep='NaT', date_format=None, **kwargs): from pandas.formats.format import _get_format_datetime64_from_values format = _get_format_datetime64_from_values(self, date_format) @@ -827,19 +827,24 @@ def _format_native_types(self, na_rep=u('NaT'), def to_datetime(self, dayfirst=False): return self.copy() - def astype(self, dtype): + @Appender(_index_shared_docs['astype']) + def astype(self, dtype, copy=True): dtype = np.dtype(dtype) - if dtype == np.object_: + if is_object_dtype(dtype): return self.asobject - elif dtype == _INT64_DTYPE: - return self.asi8.copy() - elif dtype == _NS_DTYPE and self.tz is not None: - return self.tz_convert('UTC').tz_localize(None) - elif dtype == str: + elif is_integer_dtype(dtype): + return Index(self.values.astype('i8', copy=copy), name=self.name, + dtype='i8') + elif is_datetime64_ns_dtype(dtype): + if self.tz is not None: + return self.tz_convert('UTC').tz_localize(None) + elif copy is True: + return self.copy() + return self + elif is_string_dtype(dtype): 
return Index(self.format(), name=self.name, dtype=object) - else: # pragma: no cover - raise ValueError('Cannot cast DatetimeIndex to dtype %s' % dtype) + raise ValueError('Cannot cast DatetimeIndex to dtype %s' % dtype) def _get_time_micros(self): utc = _utc() diff --git a/pandas/tseries/period.py b/pandas/tseries/period.py index b690bc23c2496..c3deee5f6dab2 100644 --- a/pandas/tseries/period.py +++ b/pandas/tseries/period.py @@ -15,11 +15,12 @@ _quarter_to_myear) from pandas.core.base import _shared_docs +from pandas.indexes.base import _index_shared_docs import pandas.core.common as com from pandas.core.common import ( - isnull, _INT64_DTYPE, _maybe_box, _values_from_object, ABCSeries, - is_integer, is_float) + _maybe_box, _values_from_object, ABCSeries, is_float, is_integer, + is_integer_dtype, is_object_dtype, isnull) from pandas import compat from pandas.compat.numpy import function as nv from pandas.util.decorators import Appender, cache_readonly, Substitution @@ -386,12 +387,14 @@ def asof_locs(self, where, mask): def _array_values(self): return self.asobject - def astype(self, dtype): + @Appender(_index_shared_docs['astype']) + def astype(self, dtype, copy=True): dtype = np.dtype(dtype) - if dtype == np.object_: - return Index(np.array(list(self), dtype), dtype) - elif dtype == _INT64_DTYPE: - return Index(self.values, dtype) + if is_object_dtype(dtype): + return self.asobject + elif is_integer_dtype(dtype): + return Index(self.values.astype('i8', copy=copy), name=self.name, + dtype='i8') raise ValueError('Cannot cast PeriodIndex to dtype %s' % dtype) @Substitution(klass='PeriodIndex', value='key') diff --git a/pandas/tseries/tdi.py b/pandas/tseries/tdi.py index 7d731c28c0f88..3e12cf14e7485 100644 --- a/pandas/tseries/tdi.py +++ b/pandas/tseries/tdi.py @@ -2,15 +2,17 @@ from datetime import timedelta import numpy as np -from pandas.core.common import (ABCSeries, _TD_DTYPE, _INT64_DTYPE, - _maybe_box, +from pandas.core.common import (ABCSeries, _TD_DTYPE, _maybe_box, _values_from_object, isnull, - is_integer, is_float) + is_integer, is_float, is_integer_dtype, + is_object_dtype, is_timedelta64_dtype, + is_timedelta64_ns_dtype) from pandas.core.index import Index, Int64Index import pandas.compat as compat from pandas.compat import u from pandas.tseries.frequencies import to_offset from pandas.core.base import _shared_docs +from pandas.indexes.base import _index_shared_docs import pandas.core.common as com import pandas.types.concat as _concat from pandas.util.decorators import Appender, Substitution @@ -435,28 +437,28 @@ def to_pytimedelta(self): """ return tslib.ints_to_pytimedelta(self.asi8) - def astype(self, dtype): + @Appender(_index_shared_docs['astype']) + def astype(self, dtype, copy=True): dtype = np.dtype(dtype) - if dtype == np.object_: + if is_object_dtype(dtype): return self.asobject - elif dtype == _INT64_DTYPE: - return self.asi8.copy() - elif dtype == _TD_DTYPE: + elif is_timedelta64_ns_dtype(dtype): + if copy is True: + return self.copy() return self - elif dtype.kind == 'm': - + elif is_timedelta64_dtype(dtype): # return an index (essentially this is division) - result = self.values.astype(dtype) + result = self.values.astype(dtype, copy=copy) if self.hasnans: return Index(self._maybe_mask_results(result, convert='float64'), name=self.name) - return Index(result.astype('i8'), name=self.name) - - else: # pragma: no cover - raise ValueError('Cannot cast TimedeltaIndex to dtype %s' % dtype) + elif is_integer_dtype(dtype): + return Index(self.values.astype('i8', 
copy=copy), dtype='i8', + name=self.name) + raise ValueError('Cannot cast TimedeltaIndex to dtype %s' % dtype) def union(self, other): """ diff --git a/pandas/tseries/tests/test_base.py b/pandas/tseries/tests/test_base.py index 2077409f4afec..97b551070f541 100644 --- a/pandas/tseries/tests/test_base.py +++ b/pandas/tseries/tests/test_base.py @@ -50,39 +50,6 @@ def test_ops_properties_basic(self): self.assertEqual(s.day, 10) self.assertRaises(AttributeError, lambda: s.weekday) - def test_astype_str(self): - # test astype string - #10442 - result = date_range('2012-01-01', periods=4, - name='test_name').astype(str) - expected = Index(['2012-01-01', '2012-01-02', '2012-01-03', - '2012-01-04'], name='test_name', dtype=object) - tm.assert_index_equal(result, expected) - - # test astype string with tz and name - result = date_range('2012-01-01', periods=3, name='test_name', - tz='US/Eastern').astype(str) - expected = Index(['2012-01-01 00:00:00-05:00', - '2012-01-02 00:00:00-05:00', - '2012-01-03 00:00:00-05:00'], - name='test_name', dtype=object) - tm.assert_index_equal(result, expected) - - # test astype string with freqH and name - result = date_range('1/1/2011', periods=3, freq='H', - name='test_name').astype(str) - expected = Index(['2011-01-01 00:00:00', '2011-01-01 01:00:00', - '2011-01-01 02:00:00'], - name='test_name', dtype=object) - tm.assert_index_equal(result, expected) - - # test astype string with freqH and timezone - result = date_range('3/6/2012 00:00', periods=2, freq='H', - tz='Europe/London', name='test_name').astype(str) - expected = Index(['2012-03-06 00:00:00+00:00', - '2012-03-06 01:00:00+00:00'], - dtype=object, name='test_name') - tm.assert_index_equal(result, expected) - def test_asobject_tolist(self): idx = pd.date_range(start='2013-01-01', periods=4, freq='M', name='idx') diff --git a/pandas/tseries/tests/test_period.py b/pandas/tseries/tests/test_period.py index 8ebdcc7acff2d..167690e4846e9 100644 --- a/pandas/tseries/tests/test_period.py +++ b/pandas/tseries/tests/test_period.py @@ -1631,12 +1631,6 @@ def test_make_time_series(self): series = Series(1, index=index) tm.assertIsInstance(series, Series) - def test_astype(self): - idx = period_range('1990', '2009', freq='A') - - result = idx.astype('i8') - self.assert_numpy_array_equal(result, idx.values) - def test_constructor_use_start_freq(self): # GH #1118 p = Period('4/2/2012', freq='B') diff --git a/pandas/tseries/tests/test_timedeltas.py b/pandas/tseries/tests/test_timedeltas.py index 8474bbbc91931..8d02c43e68be3 100644 --- a/pandas/tseries/tests/test_timedeltas.py +++ b/pandas/tseries/tests/test_timedeltas.py @@ -1191,12 +1191,6 @@ def test_append_numpy_bug_1681(self): result = a.append(c) self.assertTrue((result['B'] == td).all()) - def test_astype(self): - rng = timedelta_range('1 days', periods=10) - - result = rng.astype('i8') - self.assert_numpy_array_equal(result, rng.asi8) - def test_fields(self): rng = timedelta_range('1 days, 10:11:12.100123456', periods=2, freq='s') diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index 46f02c718a09f..50cf38be62779 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -2563,34 +2563,6 @@ def test_append_join_nondatetimeindex(self): # it works rng.join(idx, how='outer') - def test_astype(self): - rng = date_range('1/1/2000', periods=10) - - result = rng.astype('i8') - self.assert_numpy_array_equal(result, rng.asi8) - - # with tz - rng = date_range('1/1/2000', periods=10, 
tz='US/Eastern') - result = rng.astype('datetime64[ns]') - expected = (date_range('1/1/2000', periods=10, - tz='US/Eastern') - .tz_convert('UTC').tz_localize(None)) - tm.assert_index_equal(result, expected) - - # BUG#10442 : testing astype(str) is correct for Series/DatetimeIndex - result = pd.Series(pd.date_range('2012-01-01', periods=3)).astype(str) - expected = pd.Series( - ['2012-01-01', '2012-01-02', '2012-01-03'], dtype=object) - tm.assert_series_equal(result, expected) - - result = Series(pd.date_range('2012-01-01', periods=3, - tz='US/Eastern')).astype(str) - expected = Series(['2012-01-01 00:00:00-05:00', - '2012-01-02 00:00:00-05:00', - '2012-01-03 00:00:00-05:00'], - dtype=object) - tm.assert_series_equal(result, expected) - def test_to_period_nofreq(self): idx = DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-04']) self.assertRaises(ValueError, idx.to_period) From 9a6ce07ce19adb6d8ded8af2ef66326d6750171e Mon Sep 17 00:00:00 2001 From: gfyoung Date: Mon, 23 May 2016 17:41:32 -0400 Subject: [PATCH 44/96] BUG, ENH: Add support for parsing duplicate columns Closes #7160 Closes #9424 Author: gfyoung Closes #12935 from gfyoung/dupe-col-names and squashes the following commits: ef7636f [gfyoung] BUG, ENH: Add support for parsing duplicate columns --- doc/source/io.rst | 41 ++++++++++++++- doc/source/whatsnew/v0.18.2.txt | 27 ++++++++++ pandas/io/parsers.py | 58 ++++++++++++++++++---- pandas/io/tests/parser/c_parser_only.py | 23 ++++----- pandas/io/tests/parser/common.py | 21 ++++++-- pandas/io/tests/parser/test_parsers.py | 7 --- pandas/io/tests/parser/test_unsupported.py | 10 ++++ 7 files changed, 149 insertions(+), 38 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index af8bca14e5d6f..104172d9574f1 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -120,7 +120,8 @@ header : int or list of ints, default ``'infer'`` rather than the first line of the file. names : array-like, default ``None`` List of column names to use. If file contains no header row, then you should - explicitly pass ``header=None``. + explicitly pass ``header=None``. Duplicates in this list are not allowed unless + ``mangle_dupe_cols=True``, which is the default. index_col : int or sequence or ``False``, default ``None`` Column to use as the row labels of the DataFrame. If a sequence is given, a MultiIndex is used. If you have a malformed file with delimiters at the end of @@ -139,6 +140,8 @@ prefix : str, default ``None`` Prefix to add to column numbers when no header, e.g. 'X' for X0, X1, ... mangle_dupe_cols : boolean, default ``True`` Duplicate columns will be specified as 'X.0'...'X.N', rather than 'X'...'X'. + Passing in False will cause data to be overwritten if there are duplicate + names in the columns. General Parsing Configuration +++++++++++++++++++++++++++++ @@ -432,6 +435,42 @@ If the header is in a row other than the first, pass the row number to data = 'skip this skip it\na,b,c\n1,2,3\n4,5,6\n7,8,9' pd.read_csv(StringIO(data), header=1) +.. _io.dupe_names: + +Duplicate names parsing +''''''''''''''''''''''' + +If the file or header contains duplicate names, pandas by default will deduplicate +these names so as to prevent data overwrite: + +.. ipython :: python + + data = 'a,b,a\n0,1,2\n3,4,5' + pd.read_csv(StringIO(data)) + +There is no more duplicate data because ``mangle_dupe_cols=True`` by default, which modifies +a series of duplicate columns 'X'...'X' to become 'X.0'...'X.N'. If ``mangle_dupe_cols +=False``, duplicate data can arise: + +.. 
code-block :: python
+
+   In [2]: data = 'a,b,a\n0,1,2\n3,4,5'
+   In [3]: pd.read_csv(StringIO(data), mangle_dupe_cols=False)
+   Out[3]:
+      a  b  a
+   0  2  1  2
+   1  5  4  5
+
+To prevent users from encountering this problem with duplicate data, a ``ValueError``
+exception is raised if ``mangle_dupe_cols != True``:
+
+.. code-block :: python
+
+   In [2]: data = 'a,b,a\n0,1,2\n3,4,5'
+   In [3]: pd.read_csv(StringIO(data), mangle_dupe_cols=False)
+   ...
+   ValueError: Setting mangle_dupe_cols=False is not supported yet
+
 .. _io.usecols:

 Filtering columns (``usecols``)
diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt
index e2e40b643ba99..2854dbf5e655b 100644
--- a/doc/source/whatsnew/v0.18.2.txt
+++ b/doc/source/whatsnew/v0.18.2.txt
@@ -19,10 +19,37 @@ Highlights include:

 New features
 ~~~~~~~~~~~~

+.. _whatsnew_0182.enhancements.read_csv_dupe_col_names_support:
+
+``pd.read_csv`` has improved support for duplicate column names
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+:ref:`Duplicate column names <io.dupe_names>` are now supported in ``pd.read_csv()`` whether
+they are in the file or passed in as the ``names`` parameter (:issue:`7160`, :issue:`9424`)
+
+.. ipython :: python
+
+   data = '0,1,2\n3,4,5'
+   names = ['a', 'b', 'a']
+
+Previous behaviour:
+
+.. code-block:: ipython
+
+   In [2]: pd.read_csv(StringIO(data), names=names)
+   Out[2]:
+      a  b  a
+   0  2  1  2
+   1  5  4  5
+
+The first 'a' column contains the same data as the second 'a' column, when it should have
+contained the array ``[0, 3]``.
+
+New behaviour:
+
+.. ipython :: python
+
+   In [2]: pd.read_csv(StringIO(data), names=names)

 .. _whatsnew_0182.enhancements.other:

diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index 07b92fd6bfd28..c939864d7a38b 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -73,7 +73,8 @@
     rather than the first line of the file.
 names : array-like, default None
     List of column names to use. If file contains no header row, then you
-    should explicitly pass header=None
+    should explicitly pass header=None. Duplicates in this list are not
+    allowed unless mangle_dupe_cols=True, which is the default.
 index_col : int or sequence or False, default None
     Column to use as the row labels of the DataFrame. If a sequence is given, a
     MultiIndex is used. If you have a malformed file with delimiters at the end
@@ -91,7 +92,9 @@
 prefix : str, default None
     Prefix to add to column numbers when no header, e.g. 'X' for X0, X1, ...
 mangle_dupe_cols : boolean, default True
-    Duplicate columns will be specified as 'X.0'...'X.N', rather than 'X'...'X'
+    Duplicate columns will be specified as 'X.0'...'X.N', rather than
+    'X'...'X'. Passing in False will cause data to be overwritten if there
+    are duplicate names in the columns.
 dtype : Type name or dict of column -> type, default None
     Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32}
    (Unsupported with engine='python').
Use `str` or `object` to preserve and @@ -655,7 +658,14 @@ def _get_options_with_defaults(self, engine): options = {} for argname, default in compat.iteritems(_parser_defaults): - options[argname] = kwds.get(argname, default) + value = kwds.get(argname, default) + + # see gh-12935 + if argname == 'mangle_dupe_cols' and not value: + raise ValueError('Setting mangle_dupe_cols=False is ' + 'not supported yet') + else: + options[argname] = value for argname, default in compat.iteritems(_c_parser_defaults): if argname in kwds: @@ -899,6 +909,7 @@ def __init__(self, kwds): self.true_values = kwds.get('true_values') self.false_values = kwds.get('false_values') self.tupleize_cols = kwds.get('tupleize_cols', False) + self.mangle_dupe_cols = kwds.get('mangle_dupe_cols', True) self.infer_datetime_format = kwds.pop('infer_datetime_format', False) self._date_conv = _make_date_converter( @@ -1012,6 +1023,26 @@ def tostr(x): return names, index_names, col_names, passed_names + def _maybe_dedup_names(self, names): + # see gh-7160 and gh-9424: this helps to provide + # immediate alleviation of the duplicate names + # issue and appears to be satisfactory to users, + # but ultimately, not needing to butcher the names + # would be nice! + if self.mangle_dupe_cols: + names = list(names) # so we can index + counts = {} + + for i, col in enumerate(names): + cur_count = counts.get(col, 0) + + if cur_count > 0: + names[i] = '%s.%d' % (col, cur_count) + + counts[col] = cur_count + 1 + + return names + def _maybe_make_multi_index_columns(self, columns, col_names=None): # possibly create a column mi here if (not self.tupleize_cols and len(columns) and @@ -1314,10 +1345,11 @@ def read(self, nrows=None): except StopIteration: if self._first_chunk: self._first_chunk = False + names = self._maybe_dedup_names(self.orig_names) index, columns, col_dict = _get_empty_meta( - self.orig_names, self.index_col, - self.index_names, dtype=self.kwds.get('dtype')) + names, self.index_col, self.index_names, + dtype=self.kwds.get('dtype')) if self.usecols is not None: columns = self._filter_usecols(columns) @@ -1361,6 +1393,8 @@ def read(self, nrows=None): if self.usecols is not None: names = self._filter_usecols(names) + names = self._maybe_dedup_names(names) + # rename dict keys data = sorted(data.items()) data = dict((k, v) for k, (i, v) in zip(names, data)) @@ -1373,6 +1407,7 @@ def read(self, nrows=None): # ugh, mutation names = list(self.orig_names) + names = self._maybe_dedup_names(names) if self.usecols is not None: names = self._filter_usecols(names) @@ -1567,7 +1602,6 @@ def __init__(self, f, **kwds): self.skipinitialspace = kwds['skipinitialspace'] self.lineterminator = kwds['lineterminator'] self.quoting = kwds['quoting'] - self.mangle_dupe_cols = kwds.get('mangle_dupe_cols', True) self.usecols = _validate_usecols_arg(kwds['usecols']) self.skip_blank_lines = kwds['skip_blank_lines'] @@ -1756,8 +1790,8 @@ def read(self, rows=None): columns = list(self.orig_names) if not len(content): # pragma: no cover # DataFrame with the right metadata, even though it's length 0 - return _get_empty_meta(self.orig_names, - self.index_col, + names = self._maybe_dedup_names(self.orig_names) + return _get_empty_meta(names, self.index_col, self.index_names) # handle new style for names in index @@ -1770,7 +1804,8 @@ def read(self, rows=None): alldata = self._rows_to_cols(content) data = self._exclude_implicit_index(alldata) - columns, data = self._do_date_conversions(self.columns, data) + columns = self._maybe_dedup_names(self.columns) + 
columns, data = self._do_date_conversions(columns, data) data = self._convert_data(data) index, columns = self._make_index(data, alldata, columns, indexnamerow) @@ -1778,18 +1813,19 @@ def read(self, rows=None): return index, columns, data def _exclude_implicit_index(self, alldata): + names = self._maybe_dedup_names(self.orig_names) if self._implicit_index: excl_indices = self.index_col data = {} offset = 0 - for i, col in enumerate(self.orig_names): + for i, col in enumerate(names): while i + offset in excl_indices: offset += 1 data[col] = alldata[i + offset] else: - data = dict((k, v) for k, v in zip(self.orig_names, alldata)) + data = dict((k, v) for k, v in zip(names, alldata)) return data diff --git a/pandas/io/tests/parser/c_parser_only.py b/pandas/io/tests/parser/c_parser_only.py index 8e44802adf744..325418f87af6a 100644 --- a/pandas/io/tests/parser/c_parser_only.py +++ b/pandas/io/tests/parser/c_parser_only.py @@ -293,23 +293,18 @@ def test_empty_with_mangled_column_pass_dtype_by_indexes(self): {'one': np.empty(0, dtype='u1'), 'one.1': np.empty(0, dtype='f')}) tm.assert_frame_equal(result, expected, check_index_type=False) - def test_empty_with_dup_column_pass_dtype_by_names(self): - data = 'one,one' - result = self.read_csv( - StringIO(data), mangle_dupe_cols=False, dtype={'one': 'u1'}) - expected = pd.concat([Series([], name='one', dtype='u1')] * 2, axis=1) - tm.assert_frame_equal(result, expected, check_index_type=False) - def test_empty_with_dup_column_pass_dtype_by_indexes(self): - # FIXME in gh-9424 - raise nose.SkipTest( - "gh-9424; known failure read_csv with duplicate columns") + # see gh-9424 + expected = pd.concat([Series([], name='one', dtype='u1'), + Series([], name='one.1', dtype='f')], axis=1) data = 'one,one' - result = self.read_csv( - StringIO(data), mangle_dupe_cols=False, dtype={0: 'u1', 1: 'f'}) - expected = pd.concat([Series([], name='one', dtype='u1'), - Series([], name='one', dtype='f')], axis=1) + result = self.read_csv(StringIO(data), dtype={0: 'u1', 1: 'f'}) + tm.assert_frame_equal(result, expected, check_index_type=False) + + data = '' + result = self.read_csv(StringIO(data), names=['one', 'one'], + dtype={0: 'u1', 1: 'f'}) tm.assert_frame_equal(result, expected, check_index_type=False) def test_usecols_dtypes(self): diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py index 57ab9477302c1..90a0b420eed3c 100644 --- a/pandas/io/tests/parser/common.py +++ b/pandas/io/tests/parser/common.py @@ -243,6 +243,8 @@ def test_unnamed_columns(self): 'Unnamed: 4']) def test_duplicate_columns(self): + # TODO: add test for condition 'mangle_dupe_cols=False' + # once it is actually supported (gh-12935) data = """A,A,B,B,B 1,2,3,4,5 6,7,8,9,10 @@ -256,11 +258,6 @@ def test_duplicate_columns(self): self.assertEqual(list(df.columns), ['A', 'A.1', 'B', 'B.1', 'B.2']) - df = getattr(self, method)(StringIO(data), sep=',', - mangle_dupe_cols=False) - self.assertEqual(list(df.columns), - ['A', 'A', 'B', 'B', 'B']) - df = getattr(self, method)(StringIO(data), sep=',', mangle_dupe_cols=True) self.assertEqual(list(df.columns), @@ -1281,3 +1278,17 @@ def test_euro_decimal_format(self): self.assertEqual(df2['Number1'].dtype, float) self.assertEqual(df2['Number2'].dtype, float) self.assertEqual(df2['Number3'].dtype, float) + + def test_read_duplicate_names(self): + # See gh-7160 + data = "a,b,a\n0,1,2\n3,4,5" + df = self.read_csv(StringIO(data)) + expected = DataFrame([[0, 1, 2], [3, 4, 5]], + columns=['a', 'b', 'a.1']) + tm.assert_frame_equal(df, 
expected) + + data = "0,1,2\n3,4,5" + df = self.read_csv(StringIO(data), names=["a", "b", "a"]) + expected = DataFrame([[0, 1, 2], [3, 4, 5]], + columns=['a', 'b', 'a.1']) + tm.assert_frame_equal(df, expected) diff --git a/pandas/io/tests/parser/test_parsers.py b/pandas/io/tests/parser/test_parsers.py index 374485b5ddaad..ea8ce9b616f36 100644 --- a/pandas/io/tests/parser/test_parsers.py +++ b/pandas/io/tests/parser/test_parsers.py @@ -84,13 +84,6 @@ def read_table(self, *args, **kwds): class TestPythonParser(BaseParser, PythonParserTests, tm.TestCase): - """ - Class for Python parser testing. Unless specifically stated - as a PythonParser-specific issue, the goal is to eventually move - as many of these tests into ParserTests as soon as the C parser - can accept further specific arguments when parsing. - """ - engine = 'python' float_precision_choices = [None] diff --git a/pandas/io/tests/parser/test_unsupported.py b/pandas/io/tests/parser/test_unsupported.py index 1813a95d7a306..cefe7d939d1ab 100644 --- a/pandas/io/tests/parser/test_unsupported.py +++ b/pandas/io/tests/parser/test_unsupported.py @@ -20,6 +20,16 @@ class TestUnsupportedFeatures(tm.TestCase): + def test_mangle_dupe_cols_false(self): + # see gh-12935 + data = 'a b c\n1 2 3' + msg = 'is not supported' + + for engine in ('c', 'python'): + with tm.assertRaisesRegexp(ValueError, msg): + read_csv(StringIO(data), engine=engine, + mangle_dupe_cols=False) + def test_c_engine(self): # see gh-6607 data = 'a b c\n1 2 3' From 8662cb99da2cd84525d13e92e53c46060a9a5323 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Mon, 23 May 2016 20:50:45 -0400 Subject: [PATCH 45/96] TST: assert_dict_equal to check input type Author: sinhrks Closes #13264 from sinhrks/test_dict and squashes the following commits: 2a7b9b1 [sinhrks] TST: assert_dict_equal to check input type --- pandas/tests/frame/test_constructors.py | 10 ++++++++-- pandas/tests/frame/test_indexing.py | 12 +++++++---- pandas/tests/frame/test_operators.py | 11 ++++++++--- pandas/tests/frame/test_timeseries.py | 12 +++++------ pandas/tests/series/test_analytics.py | 1 + pandas/tests/series/test_combine_concat.py | 5 +++-- pandas/tests/series/test_missing.py | 4 ++-- pandas/tests/series/test_timeseries.py | 23 +++++++++++++--------- pandas/tseries/base.py | 11 ++++++++++- pandas/util/testing.py | 8 +++++++- pandas/util/validators.py | 11 ++++++++++- 11 files changed, 77 insertions(+), 31 deletions(-) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 083da2a040ed5..1d043297aa1fa 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -220,8 +220,14 @@ def test_constructor_dict(self): frame = DataFrame({'col1': self.ts1, 'col2': self.ts2}) - tm.assert_dict_equal(self.ts1, frame['col1'], compare_keys=False) - tm.assert_dict_equal(self.ts2, frame['col2'], compare_keys=False) + # col2 is padded with NaN + self.assertEqual(len(self.ts1), 30) + self.assertEqual(len(self.ts2), 25) + + tm.assert_series_equal(self.ts1, frame['col1'], check_names=False) + exp = pd.Series(np.concatenate([[np.nan] * 5, self.ts2.values]), + index=self.ts1.index, name='col2') + tm.assert_series_equal(exp, frame['col2']) frame = DataFrame({'col1': self.ts1, 'col2': self.ts2}, diff --git a/pandas/tests/frame/test_indexing.py b/pandas/tests/frame/test_indexing.py index 1e3940dc8f038..ca1ebe477e903 100644 --- a/pandas/tests/frame/test_indexing.py +++ b/pandas/tests/frame/test_indexing.py @@ -393,13 +393,17 @@ def test_setitem(self): 
series = self.frame['A'][::2] self.frame['col5'] = series self.assertIn('col5', self.frame) - tm.assert_dict_equal(series, self.frame['col5'], - compare_keys=False) + + self.assertEqual(len(series), 15) + self.assertEqual(len(self.frame), 30) + + exp = np.ravel(np.column_stack((series.values, [np.nan] * 15))) + exp = Series(exp, index=self.frame.index, name='col5') + tm.assert_series_equal(self.frame['col5'], exp) series = self.frame['A'] self.frame['col6'] = series - tm.assert_dict_equal(series, self.frame['col6'], - compare_keys=False) + tm.assert_series_equal(series, self.frame['col6'], check_names=False) with tm.assertRaises(KeyError): self.frame[randn(len(self.frame) + 1)] = 1 diff --git a/pandas/tests/frame/test_operators.py b/pandas/tests/frame/test_operators.py index cd2a0fbeefae3..7dfada0d868fe 100644 --- a/pandas/tests/frame/test_operators.py +++ b/pandas/tests/frame/test_operators.py @@ -724,9 +724,14 @@ def test_combineFrame(self): frame_copy['C'][:5] = nan added = self.frame + frame_copy - tm.assert_dict_equal(added['A'].valid(), - self.frame['A'] * 2, - compare_keys=False) + + indexer = added['A'].valid().index + exp = (self.frame['A'] * 2).copy() + + tm.assert_series_equal(added['A'].valid(), exp.loc[indexer]) + + exp.loc[~exp.index.isin(indexer)] = np.nan + tm.assert_series_equal(added['A'], exp.loc[added['A'].index]) self.assertTrue( np.isnan(added['C'].reindex(frame_copy.index)[:5]).all()) diff --git a/pandas/tests/frame/test_timeseries.py b/pandas/tests/frame/test_timeseries.py index 820076e2c6fd5..b9baae6cbeda7 100644 --- a/pandas/tests/frame/test_timeseries.py +++ b/pandas/tests/frame/test_timeseries.py @@ -120,13 +120,13 @@ def test_pct_change_shift_over_nas(self): def test_shift(self): # naive shift shiftedFrame = self.tsframe.shift(5) - self.assertTrue(shiftedFrame.index.equals(self.tsframe.index)) + self.assert_index_equal(shiftedFrame.index, self.tsframe.index) shiftedSeries = self.tsframe['A'].shift(5) assert_series_equal(shiftedFrame['A'], shiftedSeries) shiftedFrame = self.tsframe.shift(-5) - self.assertTrue(shiftedFrame.index.equals(self.tsframe.index)) + self.assert_index_equal(shiftedFrame.index, self.tsframe.index) shiftedSeries = self.tsframe['A'].shift(-5) assert_series_equal(shiftedFrame['A'], shiftedSeries) @@ -154,10 +154,10 @@ def test_shift(self): ps = tm.makePeriodFrame() shifted = ps.shift(1) unshifted = shifted.shift(-1) - self.assertTrue(shifted.index.equals(ps.index)) - - tm.assert_dict_equal(unshifted.ix[:, 0].valid(), ps.ix[:, 0], - compare_keys=False) + self.assert_index_equal(shifted.index, ps.index) + self.assert_index_equal(unshifted.index, ps.index) + tm.assert_numpy_array_equal(unshifted.ix[:, 0].valid().values, + ps.ix[:-1, 0].values) shifted2 = ps.shift(1, 'B') shifted3 = ps.shift(1, datetools.bday) diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 002b7fa3aa8df..878a639a25aa5 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -1279,6 +1279,7 @@ def test_idxmax(self): self.assertEqual(result, 1.1) def test_numpy_argmax(self): + # argmax is aliased to idxmax data = np.random.randint(0, 11, size=10) result = np.argmax(Series(data)) diff --git a/pandas/tests/series/test_combine_concat.py b/pandas/tests/series/test_combine_concat.py index 72f1cac219998..48224c7bfbd63 100644 --- a/pandas/tests/series/test_combine_concat.py +++ b/pandas/tests/series/test_combine_concat.py @@ -65,8 +65,9 @@ def test_combine_first(self): combined = 
strings.combine_first(floats) - tm.assert_dict_equal(strings, combined, compare_keys=False) - tm.assert_dict_equal(floats[1::2], combined, compare_keys=False) + tm.assert_series_equal(strings, combined.loc[index[::2]]) + tm.assert_series_equal(floats[1::2].astype(object), + combined.loc[index[1::2]]) # corner case s = Series([1., 2, 3], index=[0, 1, 2]) diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py index dec4f878d7d56..e27a21e6d5903 100644 --- a/pandas/tests/series/test_missing.py +++ b/pandas/tests/series/test_missing.py @@ -433,8 +433,8 @@ def test_valid(self): result = ts.valid() self.assertEqual(len(result), ts.count()) - - tm.assert_dict_equal(result, ts, compare_keys=False) + tm.assert_series_equal(result, ts[1::2]) + tm.assert_series_equal(result, ts[pd.notnull(ts)]) def test_isnull(self): ser = Series([0, 5.4, 3, nan, -0.001]) diff --git a/pandas/tests/series/test_timeseries.py b/pandas/tests/series/test_timeseries.py index de62fb4ab6f07..463063016f1e9 100644 --- a/pandas/tests/series/test_timeseries.py +++ b/pandas/tests/series/test_timeseries.py @@ -25,7 +25,10 @@ def test_shift(self): shifted = self.ts.shift(1) unshifted = shifted.shift(-1) - tm.assert_dict_equal(unshifted.valid(), self.ts, compare_keys=False) + tm.assert_index_equal(shifted.index, self.ts.index) + tm.assert_index_equal(unshifted.index, self.ts.index) + tm.assert_numpy_array_equal(unshifted.valid().values, + self.ts.values[:-1]) offset = datetools.bday shifted = self.ts.shift(1, freq=offset) @@ -49,7 +52,9 @@ def test_shift(self): ps = tm.makePeriodSeries() shifted = ps.shift(1) unshifted = shifted.shift(-1) - tm.assert_dict_equal(unshifted.valid(), ps, compare_keys=False) + tm.assert_index_equal(shifted.index, ps.index) + tm.assert_index_equal(unshifted.index, ps.index) + tm.assert_numpy_array_equal(unshifted.valid().values, ps.values[:-1]) shifted2 = ps.shift(1, 'B') shifted3 = ps.shift(1, datetools.bday) @@ -77,16 +82,16 @@ def test_shift(self): # xref 8260 # with tz - s = Series( - date_range('2000-01-01 09:00:00', periods=5, - tz='US/Eastern'), name='foo') + s = Series(date_range('2000-01-01 09:00:00', periods=5, + tz='US/Eastern'), name='foo') result = s - s.shift() - assert_series_equal(result, Series( - TimedeltaIndex(['NaT'] + ['1 days'] * 4), name='foo')) + + exp = Series(TimedeltaIndex(['NaT'] + ['1 days'] * 4), name='foo') + assert_series_equal(result, exp) # incompat tz - s2 = Series( - date_range('2000-01-01 09:00:00', periods=5, tz='CET'), name='foo') + s2 = Series(date_range('2000-01-01 09:00:00', periods=5, + tz='CET'), name='foo') self.assertRaises(ValueError, lambda: s - s2) def test_tshift(self): diff --git a/pandas/tseries/base.py b/pandas/tseries/base.py index 0f58d17f0ade4..e52afa74d95e2 100644 --- a/pandas/tseries/base.py +++ b/pandas/tseries/base.py @@ -189,8 +189,17 @@ def __contains__(self, key): return False def __getitem__(self, key): + """ + This getitem defers to the underlying array, which by-definition can + only handle list-likes, slices, and integer scalars + """ + + is_int = is_integer(key) + if lib.isscalar(key) and not is_int: + raise ValueError + getitem = self._data.__getitem__ - if lib.isscalar(key): + if is_int: val = getitem(key) return self._box_func(val) else: diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 39b4cca85ad9c..dd66d732ba684 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -131,7 +131,13 @@ def assert_almost_equal(left, right, check_exact=False, **kwargs): return 
_testing.assert_almost_equal(left, right, **kwargs)


-assert_dict_equal = _testing.assert_dict_equal
+def assert_dict_equal(left, right, compare_keys=True):
+
+    # instance validation
+    assertIsInstance(left, dict, '[dict] ')
+    assertIsInstance(right, dict, '[dict] ')
+
+    return _testing.assert_dict_equal(left, right, compare_keys=compare_keys)


 def randbool(size=(), p=0.5):
diff --git a/pandas/util/validators.py b/pandas/util/validators.py
index 2166dc45db605..bbfd24df9c13e 100644
--- a/pandas/util/validators.py
+++ b/pandas/util/validators.py
@@ -42,7 +42,16 @@ def _check_for_default_values(fname, arg_val_dict, compat_args):
         # as comparison may have been overridden for the left
         # hand object
         try:
-            match = (arg_val_dict[key] == compat_args[key])
+            v1 = arg_val_dict[key]
+            v2 = compat_args[key]
+
+            # check for None-ness otherwise we could end up
+            # comparing a numpy array vs None
+            if (v1 is not None and v2 is None) or \
+                    (v1 is None and v2 is not None):
+                match = False
+            else:
+                match = (v1 == v2)

             if not is_bool(match):
                 raise ValueError("'match' is not a boolean")

From 75714defbe848fb8be745687b8e64f02c108bb12 Mon Sep 17 00:00:00 2001
From: sinhrks
Date: Tue, 24 May 2016 09:22:56 -0400
Subject: [PATCH 46/96] TST: assert_dict_equal to check input type

Author: sinhrks

Closes #13264 from sinhrks/test_dict and squashes the following commits:

2a7b9b1 [sinhrks] TST: assert_dict_equal to check input type
---
 doc/source/whatsnew/v0.18.2.txt  |  2 ++
 pandas/core/categorical.py       |  2 +-
 pandas/tests/test_categorical.py | 21 ++++++++++++---------
 3 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt
index 2854dbf5e655b..a26c46b6123a0 100644
--- a/doc/source/whatsnew/v0.18.2.txt
+++ b/doc/source/whatsnew/v0.18.2.txt
@@ -248,3 +248,5 @@ Bug Fixes

 - Bug in ``groupby`` where ``apply`` returns different result depending on whether first result is ``None`` or not (:issue:`12824`)
+
+- Bug in ``Categorical.remove_unused_categories()`` changes ``.codes`` dtype to platform int (:issue:`13261`)
diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
index f0a83cbe77d3c..fa3d13c174245 100644
--- a/pandas/core/categorical.py
+++ b/pandas/core/categorical.py
@@ -898,8 +898,8 @@ def remove_unused_categories(self, inplace=False):
         if idx.size != 0 and idx[0] == -1:  # na sentinel
             idx, inv = idx[1:], inv - 1

-        cat._codes = inv
         cat._categories = cat.categories.take(idx)
+        cat._codes = _coerce_indexer_dtype(inv, self._categories)

         if not inplace:
             return cat
diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py
index 40ef5354e91bd..5a0d079efb4c2 100644
--- a/pandas/tests/test_categorical.py
+++ b/pandas/tests/test_categorical.py
@@ -1022,14 +1022,14 @@ def f():
     def test_remove_unused_categories(self):
         c = Categorical(["a", "b", "c", "d", "a"],
                         categories=["a", "b", "c", "d", "e"])
-        exp_categories_all = np.array(["a", "b", "c", "d", "e"])
-        exp_categories_dropped = np.array(["a", "b", "c", "d"])
+        exp_categories_all = Index(["a", "b", "c", "d", "e"])
+        exp_categories_dropped = Index(["a", "b", "c", "d"])

         self.assert_numpy_array_equal(c.categories, exp_categories_all)

         res = c.remove_unused_categories()
-        self.assert_numpy_array_equal(res.categories, exp_categories_dropped)
-        self.assert_numpy_array_equal(c.categories, exp_categories_all)
+        self.assert_index_equal(res.categories, exp_categories_dropped)
+        self.assert_index_equal(c.categories, exp_categories_all)

         res =
c.remove_unused_categories(inplace=True) self.assert_numpy_array_equal(c.categories, exp_categories_dropped) @@ -1039,15 +1039,18 @@ def test_remove_unused_categories(self): c = Categorical(["a", "b", "c", np.nan], categories=["a", "b", "c", "d", "e"]) res = c.remove_unused_categories() - self.assert_numpy_array_equal(res.categories, - np.array(["a", "b", "c"])) - self.assert_numpy_array_equal(c.categories, exp_categories_all) + self.assert_index_equal(res.categories, + Index(np.array(["a", "b", "c"]))) + exp_codes = np.array([0, 1, 2, -1], dtype=np.int8) + self.assert_numpy_array_equal(res.codes, exp_codes) + self.assert_index_equal(c.categories, exp_categories_all) val = ['F', np.nan, 'D', 'B', 'D', 'F', np.nan] cat = pd.Categorical(values=val, categories=list('ABCDEFG')) out = cat.remove_unused_categories() - self.assert_numpy_array_equal(out.categories, ['B', 'D', 'F']) - self.assert_numpy_array_equal(out.codes, [2, -1, 1, 0, 1, 2, -1]) + self.assert_index_equal(out.categories, Index(['B', 'D', 'F'])) + exp_codes = np.array([2, -1, 1, 0, 1, 2, -1], dtype=np.int8) + self.assert_numpy_array_equal(out.codes, exp_codes) self.assertEqual(out.get_values().tolist(), val) alpha = list('abcdefghijklmnopqrstuvwxyz') From 69ad08b0273ddc3d117690bd8746eecec6ab29ac Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 24 May 2016 11:25:52 -0400 Subject: [PATCH 47/96] BUG: Bug in selection from a HDFStore with a fixed format and start and/or stop will now return the selected range closes #8287 Author: Jeff Reback Closes #13267 from jreback/stop and squashes the following commits: 39faa23 [Jeff Reback] BUG: Bug in selection from a HDFStore with a fixed format and start and/or stop specified will now return the selected range --- doc/source/whatsnew/v0.18.2.txt | 2 + pandas/io/pytables.py | 120 ++++++++++++++++++++----------- pandas/io/tests/test_pytables.py | 54 +++++++++++++- 3 files changed, 132 insertions(+), 44 deletions(-) diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index a26c46b6123a0..ee2761b79b620 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -79,6 +79,7 @@ Other enhancements - ``Index.astype()`` now accepts an optional boolean argument ``copy``, which allows optional copying if the requirements on dtype are satisfied (:issue:`13209`) - ``Categorical.astype()`` now accepts an optional boolean argument ``copy``, effective when dtype is categorical (:issue:`13209`) + .. 
_whatsnew_0182.api: API changes @@ -207,6 +208,7 @@ Bug Fixes - Bug in ``SparseSeries`` and ``SparseDataFrame`` creation with ``object`` dtype may raise ``TypeError`` (:issue:`11633`) - Bug when passing a not-default-indexed ``Series`` as ``xerr`` or ``yerr`` in ``.plot()`` (:issue:`11858`) - Bug in matplotlib ``AutoDataFormatter``; this restores the second scaled formatting and re-adds micro-second scaled formatting (:issue:`13131`) +- Bug in selection from a ``HDFStore`` with a fixed format and ``start`` and/or ``stop`` specified will now return the selected range (:issue:`8287`) - Bug in ``.groupby(..).resample(..)`` when the same object is called multiple times (:issue:`13174`) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index d350358081aa7..fcf5125d956c6 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -1314,12 +1314,20 @@ def __init__(self, store, s, func, where, nrows, start=None, stop=None, self.s = s self.func = func self.where = where - self.nrows = nrows or 0 - self.start = start or 0 - if stop is None: - stop = self.nrows - self.stop = min(self.nrows, stop) + # set start/stop if they are not set if we are a table + if self.s.is_table: + if nrows is None: + nrows = 0 + if start is None: + start = 0 + if stop is None: + stop = nrows + stop = min(nrows, stop) + + self.nrows = nrows + self.start = start + self.stop = stop self.coordinates = None if iterator or chunksize is not None: @@ -2303,14 +2311,23 @@ def f(values, freq=None, tz=None): return klass def validate_read(self, kwargs): - if kwargs.get('columns') is not None: + """ + remove table keywords from kwargs and return + raise if any keywords are passed which are not-None + """ + kwargs = copy.copy(kwargs) + + columns = kwargs.pop('columns', None) + if columns is not None: raise TypeError("cannot pass a column specification when reading " "a Fixed format store. this store must be " "selected in its entirety") - if kwargs.get('where') is not None: + where = kwargs.pop('where', None) + if where is not None: raise TypeError("cannot pass a where specification when reading " "from a Fixed format store. 
this store must be "
                            "selected in its entirety")
+        return kwargs

     @property
     def is_exists(self):
@@ -2329,11 +2346,11 @@ def get_attrs(self):
     def write(self, obj, **kwargs):
         self.set_attrs()

-    def read_array(self, key):
+    def read_array(self, key, start=None, stop=None):
        """ read an array for the specified node (off of group) """
         import tables
         node = getattr(self.group, key)
-        data = node[:]
+        data = node[start:stop]
         attrs = node._v_attrs

         transposed = getattr(attrs, 'transposed', False)
@@ -2363,17 +2380,17 @@ def read_array(self, key):
         else:
             return ret

-    def read_index(self, key):
+    def read_index(self, key, **kwargs):
         variety = _ensure_decoded(getattr(self.attrs, '%s_variety' % key))

         if variety == u('multi'):
-            return self.read_multi_index(key)
+            return self.read_multi_index(key, **kwargs)
         elif variety == u('block'):
-            return self.read_block_index(key)
+            return self.read_block_index(key, **kwargs)
         elif variety == u('sparseint'):
-            return self.read_sparse_intindex(key)
+            return self.read_sparse_intindex(key, **kwargs)
         elif variety == u('regular'):
-            _, index = self.read_index_node(getattr(self.group, key))
+            _, index = self.read_index_node(getattr(self.group, key), **kwargs)
             return index
         else:  # pragma: no cover
             raise TypeError('unrecognized index variety: %s' % variety)
@@ -2411,19 +2428,19 @@ def write_block_index(self, key, index):
         self.write_array('%s_blocs' % key, index.blocs)
         self.write_array('%s_blengths' % key, index.blengths)
         setattr(self.attrs, '%s_length' % key, index.length)

-    def read_block_index(self, key):
+    def read_block_index(self, key, **kwargs):
         length = getattr(self.attrs, '%s_length' % key)
-        blocs = self.read_array('%s_blocs' % key)
-        blengths = self.read_array('%s_blengths' % key)
+        blocs = self.read_array('%s_blocs' % key, **kwargs)
+        blengths = self.read_array('%s_blengths' % key, **kwargs)
         return BlockIndex(length, blocs, blengths)

     def write_sparse_intindex(self, key, index):
         self.write_array('%s_indices' % key, index.indices)
         setattr(self.attrs, '%s_length' % key, index.length)

-    def read_sparse_intindex(self, key):
+    def read_sparse_intindex(self, key, **kwargs):
         length = getattr(self.attrs, '%s_length' % key)
-        indices = self.read_array('%s_indices' % key)
+        indices = self.read_array('%s_indices' % key, **kwargs)
         return IntIndex(length, indices)

     def write_multi_index(self, key, index):
@@ -2448,7 +2465,7 @@ def write_multi_index(self, key, index):
             label_key = '%s_label%d' % (key, i)
             self.write_array(label_key, lab)

-    def read_multi_index(self, key):
+    def read_multi_index(self, key, **kwargs):
         nlevels = getattr(self.attrs, '%s_nlevels' % key)

         levels = []
@@ -2456,19 +2473,20 @@
         names = []
         for i in range(nlevels):
             level_key = '%s_level%d' % (key, i)
-            name, lev = self.read_index_node(getattr(self.group, level_key))
+            name, lev = self.read_index_node(getattr(self.group, level_key),
+                                             **kwargs)
             levels.append(lev)
             names.append(name)

             label_key = '%s_label%d' % (key, i)
-            lab = self.read_array(label_key)
+            lab = self.read_array(label_key, **kwargs)
             labels.append(lab)

         return MultiIndex(levels=levels, labels=labels, names=names,
                           verify_integrity=True)

-    def read_index_node(self, node):
-        data = node[:]
+    def read_index_node(self, node, start=None, stop=None):
+        data = node[start:stop]

         # If the index was an empty array write_array_empty() will
         # have written a sentinel. Here we replace it with the original.
if ('shape' in node._v_attrs and @@ -2607,9 +2625,9 @@ def write_array(self, key, value, items=None): class LegacyFixed(GenericFixed): - def read_index_legacy(self, key): + def read_index_legacy(self, key, start=None, stop=None): node = getattr(self.group, key) - data = node[:] + data = node[start:stop] kind = node._v_attrs.kind return _unconvert_index_legacy(data, kind, encoding=self.encoding) @@ -2617,7 +2635,7 @@ def read_index_legacy(self, key): class LegacySeriesFixed(LegacyFixed): def read(self, **kwargs): - self.validate_read(kwargs) + kwargs = self.validate_read(kwargs) index = self.read_index_legacy('index') values = self.read_array('values') return Series(values, index=index) @@ -2626,7 +2644,7 @@ def read(self, **kwargs): class LegacyFrameFixed(LegacyFixed): def read(self, **kwargs): - self.validate_read(kwargs) + kwargs = self.validate_read(kwargs) index = self.read_index_legacy('index') columns = self.read_index_legacy('columns') values = self.read_array('values') @@ -2645,9 +2663,9 @@ def shape(self): return None def read(self, **kwargs): - self.validate_read(kwargs) - index = self.read_index('index') - values = self.read_array('values') + kwargs = self.validate_read(kwargs) + index = self.read_index('index', **kwargs) + values = self.read_array('values', **kwargs) return Series(values, index=index, name=self.name) def write(self, obj, **kwargs): @@ -2657,12 +2675,25 @@ def write(self, obj, **kwargs): self.attrs.name = obj.name -class SparseSeriesFixed(GenericFixed): +class SparseFixed(GenericFixed): + + def validate_read(self, kwargs): + """ + we don't support start, stop kwds in Sparse + """ + kwargs = super(SparseFixed, self).validate_read(kwargs) + if 'start' in kwargs or 'stop' in kwargs: + raise NotImplementedError("start and/or stop are not supported " + "in fixed Sparse reading") + return kwargs + + +class SparseSeriesFixed(SparseFixed): pandas_kind = u('sparse_series') attributes = ['name', 'fill_value', 'kind'] def read(self, **kwargs): - self.validate_read(kwargs) + kwargs = self.validate_read(kwargs) index = self.read_index('index') sp_values = self.read_array('sp_values') sp_index = self.read_index('sp_index') @@ -2681,12 +2712,12 @@ def write(self, obj, **kwargs): self.attrs.kind = obj.kind -class SparseFrameFixed(GenericFixed): +class SparseFrameFixed(SparseFixed): pandas_kind = u('sparse_frame') attributes = ['default_kind', 'default_fill_value'] def read(self, **kwargs): - self.validate_read(kwargs) + kwargs = self.validate_read(kwargs) columns = self.read_index('columns') sdict = {} for c in columns: @@ -2714,12 +2745,12 @@ def write(self, obj, **kwargs): self.write_index('columns', obj.columns) -class SparsePanelFixed(GenericFixed): +class SparsePanelFixed(SparseFixed): pandas_kind = u('sparse_panel') attributes = ['default_kind', 'default_fill_value'] def read(self, **kwargs): - self.validate_read(kwargs) + kwargs = self.validate_read(kwargs) items = self.read_index('items') sdict = {} @@ -2782,19 +2813,26 @@ def shape(self): except: return None - def read(self, **kwargs): - self.validate_read(kwargs) + def read(self, start=None, stop=None, **kwargs): + # start, stop applied to rows, so 0th axis only + + kwargs = self.validate_read(kwargs) + select_axis = self.obj_type()._get_block_manager_axis(0) axes = [] for i in range(self.ndim): - ax = self.read_index('axis%d' % i) + + _start, _stop = (start, stop) if i == select_axis else (None, None) + ax = self.read_index('axis%d' % i, start=_start, stop=_stop) axes.append(ax) items = axes[0] blocks = [] for i in 
range(self.nblocks): + blk_items = self.read_index('block%d_items' % i) - values = self.read_array('block%d_values' % i) + values = self.read_array('block%d_values' % i, + start=_start, stop=_stop) blk = make_block(values, placement=items.get_indexer(blk_items)) blocks.append(blk) diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 5ee84ce97979a..4c72a47dbdf6e 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -4128,10 +4128,11 @@ def test_nan_selection_bug_4858(self): result = store.select('df', where='values>2.0') assert_frame_equal(result, expected) - def test_start_stop(self): + def test_start_stop_table(self): with ensure_clean_store(self.path) as store: + # table df = DataFrame(dict(A=np.random.rand(20), B=np.random.rand(20))) store.append('df', df) @@ -4143,8 +4144,55 @@ def test_start_stop(self): # out of range result = store.select( 'df', [Term("columns=['A']")], start=30, stop=40) - assert(len(result) == 0) - assert(type(result) == DataFrame) + self.assertTrue(len(result) == 0) + expected = df.ix[30:40, ['A']] + tm.assert_frame_equal(result, expected) + + def test_start_stop_fixed(self): + + with ensure_clean_store(self.path) as store: + + # fixed, GH 8287 + df = DataFrame(dict(A=np.random.rand(20), + B=np.random.rand(20)), + index=pd.date_range('20130101', periods=20)) + store.put('df', df) + + result = store.select( + 'df', start=0, stop=5) + expected = df.iloc[0:5, :] + tm.assert_frame_equal(result, expected) + + result = store.select( + 'df', start=5, stop=10) + expected = df.iloc[5:10, :] + tm.assert_frame_equal(result, expected) + + # out of range + result = store.select( + 'df', start=30, stop=40) + expected = df.iloc[30:40, :] + tm.assert_frame_equal(result, expected) + + # series + s = df.A + store.put('s', s) + result = store.select('s', start=0, stop=5) + expected = s.iloc[0:5] + tm.assert_series_equal(result, expected) + + result = store.select('s', start=5, stop=10) + expected = s.iloc[5:10] + tm.assert_series_equal(result, expected) + + # sparse; not implemented + df = tm.makeDataFrame() + df.ix[3:5, 1:3] = np.nan + df.ix[8:10, -2] = np.nan + dfs = df.to_sparse() + store.put('dfs', dfs) + with self.assertRaises(NotImplementedError): + store.select('dfs', start=0, stop=5) def test_select_filter_corner(self): From e0a2e3bc51f9e178f72c44e6de06700ee0bf31c6 Mon Sep 17 00:00:00 2001 From: Mortada Mehyar Date: Tue, 24 May 2016 11:34:33 -0400 Subject: [PATCH 48/96] DOC: fixed typos in GroupBy document Author: Mortada Mehyar Closes #13270 from mortada/docs and squashes the following commits: a3dea39 [Mortada Mehyar] DOC: fixed typos in GroupBy document --- doc/source/groupby.rst | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst index 4cde1fed344a8..02309fe5d6509 100644 --- a/doc/source/groupby.rst +++ b/doc/source/groupby.rst @@ -52,7 +52,7 @@ following: step and try to return a sensibly combined result if it doesn't fit into either of the above two categories -Since the set of object instance method on pandas data structures are generally +Since the set of object instance methods on pandas data structures are generally rich and expressive, we often simply want to invoke, say, a DataFrame function on each group. 
The name GroupBy should be quite familiar to those who have used a SQL-based tool (or ``itertools``), in which you can write code like: @@ -129,7 +129,7 @@ columns: In [5]: grouped = df.groupby(get_letter_type, axis=1) -Starting with 0.8, pandas Index objects now supports duplicate values. If a +Starting with 0.8, pandas Index objects now support duplicate values. If a non-unique index is used as the group key in a groupby operation, all values for the same index value will be considered to be in one group and thus the output of aggregation functions will only contain unique index values: @@ -171,7 +171,8 @@ By default the group keys are sorted during the ``groupby`` operation. You may h df2.groupby(['X'], sort=False).sum() -Note that ``groupby`` will preserve the order in which *observations* are sorted *within* each group. For example, the groups created by ``groupby()`` below are in the order the appeared in the original ``DataFrame``: +Note that ``groupby`` will preserve the order in which *observations* are sorted *within* each group. +For example, the groups created by ``groupby()`` below are in the order they appeared in the original ``DataFrame``: .. ipython:: python @@ -254,7 +255,7 @@ GroupBy with MultiIndex With :ref:`hierarchically-indexed data `, it's quite natural to group by one of the levels of the hierarchy. -Let's create a series with a two-level ``MultiIndex``. +Let's create a Series with a two-level ``MultiIndex``. .. ipython:: python @@ -636,7 +637,7 @@ with NaNs. dff.groupby('B').filter(lambda x: len(x) > 2, dropna=False) -For dataframes with multiple columns, filters should explicitly specify a column as the filter criterion. +For DataFrames with multiple columns, filters should explicitly specify a column as the filter criterion. .. ipython:: python @@ -755,7 +756,7 @@ The dimension of the returned result can also change: .. note:: - ``apply`` can act as a reducer, transformer, *or* filter function, depending on exactly what is passed to apply. + ``apply`` can act as a reducer, transformer, *or* filter function, depending on exactly what is passed to it. So depending on the path taken, and exactly what you are grouping. Thus the grouped columns(s) may be included in the output as well as set the indices. @@ -789,7 +790,7 @@ Again consider the example DataFrame we've been looking at: df -Supposed we wished to compute the standard deviation grouped by the ``A`` +Suppose we wish to compute the standard deviation grouped by the ``A`` column. There is a slight problem, namely that we don't care about the data in column ``B``. We refer to this as a "nuisance" column. If the passed aggregation function can't be applied to some columns, the troublesome columns @@ -1019,7 +1020,7 @@ Returning a Series to propagate names ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Group DataFrame columns, compute a set of metrics and return a named Series. -The Series name is used as the name for the column index. This is especially +The Series name is used as the name for the column index. This is especially useful in conjunction with reshaping operations such as stacking in which the column index name will be used as the name of the inserted column: From b638f18469725eaad3dfe82dd7b01e285010a990 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Wed, 25 May 2016 08:08:46 -0400 Subject: [PATCH 49/96] BUG: Properly validate and parse nrows in read_csv 1) Allows `float` values for `nrows` for the Python engine 2) Prevents abuse of the `nrows` argument for the CParser (e.g. 
you could previously pass `nrows=1.2`)

Closes #10476.

Author: gfyoung

Closes #13275 from gfyoung/nrows-bug-validate and squashes the following commits:

d856051 [gfyoung] BUG: Properly validate and parse nrows in read_csv
---
 doc/source/whatsnew/v0.18.2.txt            |  1 +
 pandas/io/parsers.py                       | 24 ++++++++++++++++++++--
 pandas/io/tests/parser/common.py           | 20 ++++++++++++------
 pandas/io/tests/parser/test_unsupported.py |  9 ++++++++
 4 files changed, 46 insertions(+), 8 deletions(-)

diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt
index ee2761b79b620..c9d267c05d370 100644
--- a/doc/source/whatsnew/v0.18.2.txt
+++ b/doc/source/whatsnew/v0.18.2.txt
@@ -252,3 +252,4 @@ Bug Fixes

 - Bug in ``groupby`` where ``apply`` returns different result depending on whether first result is ``None`` or not (:issue:`12824`)
 - Bug in ``Categorical.remove_unused_categories()`` changes ``.codes`` dtype to platform int (:issue:`13261`)
+- Bug in ``pd.read_csv`` in which the ``nrows`` argument was not properly validated for both engines (:issue:`10476`)
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index c939864d7a38b..95a7f63075167 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -272,6 +272,26 @@
 """ % (_parser_params % (_fwf_widths, ''))


+def _validate_nrows(nrows):
+    """
+    Checks whether the 'nrows' parameter for parsing is either
+    an integer OR float that can SAFELY be cast to an integer
+    without losing accuracy. Raises a ValueError if that is
+    not the case.
+    """
+    msg = "'nrows' must be an integer"
+
+    if nrows is not None:
+        if com.is_float(nrows):
+            if int(nrows) != nrows:
+                raise ValueError(msg)
+            nrows = int(nrows)
+        elif not com.is_integer(nrows):
+            raise ValueError(msg)
+
+    return nrows
+
+
 def _read(filepath_or_buffer, kwds):
     "Generic reader of line files."
     encoding = kwds.get('encoding', None)
@@ -311,14 +331,14 @@ def _read(filepath_or_buffer, kwds):

     # Extract some of the arguments (pass chunksize on).
     iterator = kwds.get('iterator', False)
-    nrows = kwds.pop('nrows', None)
     chunksize = kwds.get('chunksize', None)
+    nrows = _validate_nrows(kwds.pop('nrows', None))

     # Create the parser.
parser = TextFileReader(filepath_or_buffer, **kwds) if (nrows is not None) and (chunksize is not None): - raise NotImplementedError("'nrows' and 'chunksize' can not be used" + raise NotImplementedError("'nrows' and 'chunksize' cannot be used" " together yet.") elif nrows is not None: return parser.read(nrows) diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py index 90a0b420eed3c..8c4bf3644127e 100644 --- a/pandas/io/tests/parser/common.py +++ b/pandas/io/tests/parser/common.py @@ -391,10 +391,23 @@ def test_int_conversion(self): self.assertEqual(data['B'].dtype, np.int64) def test_read_nrows(self): - df = self.read_csv(StringIO(self.data1), nrows=3) expected = self.read_csv(StringIO(self.data1))[:3] + + df = self.read_csv(StringIO(self.data1), nrows=3) tm.assert_frame_equal(df, expected) + # see gh-10476 + df = self.read_csv(StringIO(self.data1), nrows=3.0) + tm.assert_frame_equal(df, expected) + + msg = "must be an integer" + + with tm.assertRaisesRegexp(ValueError, msg): + self.read_csv(StringIO(self.data1), nrows=1.2) + + with tm.assertRaisesRegexp(ValueError, msg): + self.read_csv(StringIO(self.data1), nrows='foo') + def test_read_chunksize(self): reader = self.read_csv(StringIO(self.data1), index_col=0, chunksize=2) df = self.read_csv(StringIO(self.data1), index_col=0) @@ -815,11 +828,6 @@ def test_ignore_leading_whitespace(self): expected = DataFrame({'a': [1, 4, 7], 'b': [2, 5, 8], 'c': [3, 6, 9]}) tm.assert_frame_equal(result, expected) - def test_nrows_and_chunksize_raises_notimplemented(self): - data = 'a b c' - self.assertRaises(NotImplementedError, self.read_csv, StringIO(data), - nrows=10, chunksize=5) - def test_chunk_begins_with_newline_whitespace(self): # see gh-10022 data = '\n hello\nworld\n' diff --git a/pandas/io/tests/parser/test_unsupported.py b/pandas/io/tests/parser/test_unsupported.py index cefe7d939d1ab..3c1c45831e7b4 100644 --- a/pandas/io/tests/parser/test_unsupported.py +++ b/pandas/io/tests/parser/test_unsupported.py @@ -30,6 +30,15 @@ def test_mangle_dupe_cols_false(self): read_csv(StringIO(data), engine=engine, mangle_dupe_cols=False) + def test_nrows_and_chunksize(self): + data = 'a b c' + msg = "cannot be used together yet" + + for engine in ('c', 'python'): + with tm.assertRaisesRegexp(NotImplementedError, msg): + read_csv(StringIO(data), engine=engine, + nrows=10, chunksize=5) + def test_c_engine(self): # see gh-6607 data = 'a b c\n1 2 3' From 87492737f2f81183700849d453c38d507f128811 Mon Sep 17 00:00:00 2001 From: Roy Keyes Date: Tue, 24 May 2016 22:42:52 -0700 Subject: [PATCH 50/96] BUG: Fix for resampler for grouping kwarg bug closes #13235 closes #13241 --- doc/source/whatsnew/v0.18.2.txt | 8 ++++++-- pandas/tseries/resample.py | 3 +-- pandas/tseries/tests/test_resample.py | 20 ++++++++++++++++++++ 3 files changed, 27 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index c9d267c05d370..ebae54f292e3c 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -224,9 +224,11 @@ Bug Fixes - Bug in ``PeriodIndex`` and ``Period`` subtraction raises ``AttributeError`` (:issue:`13071`) -- Bug in ``.resample(..)`` with a ``PeriodIndex`` not changing its ``freq`` appropriately when empty (:issue:`13067`) - Bug in ``PeriodIndex`` construction returning a ``float64`` index in some circumstances (:issue:`13067`) +- Bug in ``.resample(..)`` with a ``PeriodIndex`` not changing its ``freq`` appropriately when empty (:issue:`13067`) - Bug in 
``.resample(..)`` with a ``PeriodIndex`` not retaining its type or name with an empty ``DataFrame`` appropriately when empty (:issue:`13212`)
+- Bug in ``groupby(..).resample(..)`` where passing some keywords would raise an exception (:issue:`13235`)
+- Bug in ``pd.read_csv`` in which the ``nrows`` argument was not properly validated for both engines (:issue:`10476`)



@@ -251,5 +253,7 @@ Bug Fixes

 - Bug in ``groupby`` where ``apply`` returns different result depending on whether first result is ``None`` or not (:issue:`12824`)
+
+
+
 - Bug in ``Categorical.remove_unused_categories()`` changes ``.codes`` dtype to platform int (:issue:`13261`)
-- Bug in ``pd.read_csv`` in which the ``nrows`` argument was not properly validated for both engines (:issue:`10476`)
diff --git a/pandas/tseries/resample.py b/pandas/tseries/resample.py
index 90ec5d19db590..8d6955ab43711 100644
--- a/pandas/tseries/resample.py
+++ b/pandas/tseries/resample.py
@@ -912,8 +912,7 @@ def get_resampler_for_grouping(groupby, rule, how=None, fill_method=None,
     return _maybe_process_deprecations(r,
                                        how=how,
                                        fill_method=fill_method,
-                                       limit=limit,
-                                       **kwargs)
+                                       limit=limit)


 class TimeGrouper(Grouper):
diff --git a/pandas/tseries/tests/test_resample.py b/pandas/tseries/tests/test_resample.py
index 8e6341c6b7cc3..6b94c828bddc0 100644
--- a/pandas/tseries/tests/test_resample.py
+++ b/pandas/tseries/tests/test_resample.py
@@ -2670,6 +2670,26 @@ def f(x):
         result = g.apply(f)
         assert_frame_equal(result, expected)

+    def test_resample_groupby_with_label(self):
+        # GH 13235
+        index = date_range('2000-01-01', freq='2D', periods=5)
+        df = DataFrame(index=index,
+                       data={'col0': [0, 0, 1, 1, 2], 'col1': [1, 1, 1, 1, 1]}
+                       )
+        result = df.groupby('col0').resample('1W', label='left').sum()
+
+        mi = [np.array([0, 0, 1, 2]),
+              pd.to_datetime(np.array(['1999-12-26', '2000-01-02',
+                                       '2000-01-02', '2000-01-02'])
+                             )
+              ]
+        mindex = pd.MultiIndex.from_arrays(mi, names=['col0', None])
+        expected = DataFrame(data={'col0': [0, 0, 2, 2], 'col1': [1, 1, 2, 1]},
+                             index=mindex
+                             )
+
+        assert_frame_equal(result, expected)
+
     def test_consistency_with_window(self):

         # consistent return values with window
From da5fc17fe9bee7527fb89b4540dc0c40555aaf16 Mon Sep 17 00:00:00 2001
From: gfyoung
Date: Wed, 25 May 2016 13:27:28 -0400
Subject: [PATCH 51/96] BUG, ENH: Improve infinity parsing for read_csv

1) Allow mixed-case infinity strings for the Python engine
2) Interpret `+inf` as positive infinity for both engines

Author: gfyoung

Closes #13274 from gfyoung/floatify-infinity-parsing and squashes the following commits:

f37b130 [gfyoung] BUG, ENH: Improve infinity parsing in read_csv
---
 doc/source/whatsnew/v0.18.2.txt         |  2 ++
 pandas/io/tests/parser/c_parser_only.py | 22 --------------
 pandas/io/tests/parser/common.py        | 24 +++++++++++++++
 pandas/parser.pyx                       |  5 ++--
 pandas/src/parse_helper.h               | 33 +++++++++++++++------
 pandas/tests/test_lib.py                | 39 +++++++++++++++++++++++++
 6 files changed, 92 insertions(+), 33 deletions(-)

diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt
index ebae54f292e3c..004e2dcc20084 100644
--- a/doc/source/whatsnew/v0.18.2.txt
+++ b/doc/source/whatsnew/v0.18.2.txt
@@ -78,6 +78,7 @@ Other enhancements

 - ``Index.astype()`` now accepts an optional boolean argument ``copy``, which allows optional copying if the requirements on dtype are satisfied (:issue:`13209`)
 - ``Categorical.astype()`` now accepts an optional boolean argument ``copy``, effective when dtype is categorical (:issue:`13209`)
+- Consistent with the Python
API, ``pd.read_csv()`` will now interpret ``+inf`` as positive infinity (:issue:`13274`)

 .. _whatsnew_0182.api:

@@ -237,6 +238,7 @@ Bug Fixes



+- Bug in ``pd.read_csv()`` with ``engine=='python'`` in which infinities of mixed-case forms were not being interpreted properly (:issue:`13274`)

diff --git a/pandas/io/tests/parser/c_parser_only.py b/pandas/io/tests/parser/c_parser_only.py
index 325418f87af6a..aeee77bb02e98 100644
--- a/pandas/io/tests/parser/c_parser_only.py
+++ b/pandas/io/tests/parser/c_parser_only.py
@@ -447,25 +447,3 @@ def test_empty_header_read(count):

         for count in range(1, 101):
             test_empty_header_read(count)
-
-    def test_inf_parsing(self):
-        data = """\
-,A
-a,inf
-b,-inf
-c,Inf
-d,-Inf
-e,INF
-f,-INF
-g,INf
-h,-INf
-i,inF
-j,-inF"""
-        inf = float('inf')
-        expected = Series([inf, -inf] * 5)
-
-        df = self.read_csv(StringIO(data), index_col=0)
-        tm.assert_almost_equal(df['A'].values, expected.values)
-
-        df = self.read_csv(StringIO(data), index_col=0, na_filter=False)
-        tm.assert_almost_equal(df['A'].values, expected.values)
diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py
index 8c4bf3644127e..3912bbbf11e53 100644
--- a/pandas/io/tests/parser/common.py
+++ b/pandas/io/tests/parser/common.py
@@ -1300,3 +1300,27 @@ def test_read_duplicate_names(self):
         expected = DataFrame([[0, 1, 2], [3, 4, 5]],
                              columns=['a', 'b', 'a.1'])
         tm.assert_frame_equal(df, expected)
+
+    def test_inf_parsing(self):
+        data = """\
+,A
+a,inf
+b,-inf
+c,+Inf
+d,-Inf
+e,INF
+f,-INF
+g,+INf
+h,-INf
+i,inF
+j,-inF"""
+        inf = float('inf')
+        expected = Series([inf, -inf] * 5)
+
+        df = self.read_csv(StringIO(data), index_col=0)
+        tm.assert_almost_equal(df['A'].values, expected.values)
+
+        if self.engine == 'c':
+            # TODO: remove condition when 'na_filter' is supported for Python
+            df = self.read_csv(StringIO(data), index_col=0, na_filter=False)
+            tm.assert_almost_equal(df['A'].values, expected.values)
diff --git a/pandas/parser.pyx b/pandas/parser.pyx
index 94d7f36f4f205..729e5af528b80 100644
--- a/pandas/parser.pyx
+++ b/pandas/parser.pyx
@@ -1501,6 +1501,7 @@ cdef inline void _to_fw_string_nogil(parser_t *parser, int col, int line_start,
         data += width

 cdef char* cinf = b'inf'
+cdef char* cposinf = b'+inf'
 cdef char* cneginf = b'-inf'

 cdef _try_double(parser_t *parser, int col, int line_start, int line_end,
@@ -1562,7 +1563,7 @@ cdef inline int _try_double_nogil(parser_t *parser, int col, int line_start, int
             data[0] = parser.converter(word, &p_end, parser.decimal, parser.sci,
                                        parser.thousands, 1)
             if errno != 0 or p_end[0] or p_end == word:
-                if strcasecmp(word, cinf) == 0:
+                if strcasecmp(word, cinf) == 0 or strcasecmp(word, cposinf) == 0:
                     data[0] = INF
                 elif strcasecmp(word, cneginf) == 0:
                     data[0] = NEGINF
@@ -1581,7 +1582,7 @@ cdef inline int _try_double_nogil(parser_t *parser, int col, int line_start, int
             data[0] = parser.converter(word, &p_end, parser.decimal, parser.sci,
                                        parser.thousands, 1)
             if errno != 0 or p_end[0] or p_end == word:
-                if strcasecmp(word, cinf) == 0:
+                if strcasecmp(word, cinf) == 0 or strcasecmp(word, cposinf) == 0:
                     data[0] = INF
                 elif strcasecmp(word, cneginf) == 0:
                     data[0] = NEGINF
diff --git a/pandas/src/parse_helper.h b/pandas/src/parse_helper.h
index d47e448700029..fd5089dd8963d 100644
--- a/pandas/src/parse_helper.h
+++ b/pandas/src/parse_helper.h
@@ -1,5 +1,6 @@
 #include <Python.h>
 #include <math.h>
+#include "headers/portable.h"

 static double xstrtod(const char *p, char **q, char decimal, char sci,
                       int skip_trailing, int *maybe_int);
@@ -39,22 +40,36 @@ int
floatify(PyObject* str, double *result, int *maybe_int) { if (!status) { /* handle inf/-inf */ - if (0 == strcmp(data, "-inf")) { - *result = -HUGE_VAL; - *maybe_int = 0; - } else if (0 == strcmp(data, "inf")) { - *result = HUGE_VAL; - *maybe_int = 0; + if (strlen(data) == 3) { + if (0 == strcasecmp(data, "inf")) { + *result = HUGE_VAL; + *maybe_int = 0; + } else { + goto parsingerror; + } + } else if (strlen(data) == 4) { + if (0 == strcasecmp(data, "-inf")) { + *result = -HUGE_VAL; + *maybe_int = 0; + } else if (0 == strcasecmp(data, "+inf")) { + *result = HUGE_VAL; + *maybe_int = 0; + } else { + goto parsingerror; + } } else { - PyErr_SetString(PyExc_ValueError, "Unable to parse string"); - Py_XDECREF(tmp); - return -1; + goto parsingerror; } } Py_XDECREF(tmp); return 0; +parsingerror: + PyErr_SetString(PyExc_ValueError, "Unable to parse string"); + Py_XDECREF(tmp); + return -1; + /* #if PY_VERSION_HEX >= 0x03000000 return PyFloat_FromString(str); diff --git a/pandas/tests/test_lib.py b/pandas/tests/test_lib.py index 6912e3a7ff68c..2aa31063df446 100644 --- a/pandas/tests/test_lib.py +++ b/pandas/tests/test_lib.py @@ -188,6 +188,45 @@ def test_isinf_scalar(self): self.assertFalse(lib.isneginf_scalar(1)) self.assertFalse(lib.isneginf_scalar('a')) + def test_maybe_convert_numeric_infinities(self): + # see gh-13274 + infinities = ['inf', 'inF', 'iNf', 'Inf', + 'iNF', 'InF', 'INf', 'INF'] + na_values = set(['', 'NULL', 'nan']) + + pos = np.array(['inf'], dtype=np.float64) + neg = np.array(['-inf'], dtype=np.float64) + + msg = "Unable to parse string" + + for infinity in infinities: + for maybe_int in (True, False): + out = lib.maybe_convert_numeric( + np.array([infinity], dtype=object), + na_values, maybe_int) + tm.assert_numpy_array_equal(out, pos) + + out = lib.maybe_convert_numeric( + np.array(['-' + infinity], dtype=object), + na_values, maybe_int) + tm.assert_numpy_array_equal(out, neg) + + out = lib.maybe_convert_numeric( + np.array([u(infinity)], dtype=object), + na_values, maybe_int) + tm.assert_numpy_array_equal(out, pos) + + out = lib.maybe_convert_numeric( + np.array(['+' + infinity], dtype=object), + na_values, maybe_int) + tm.assert_numpy_array_equal(out, pos) + + # too many characters + with tm.assertRaisesRegexp(ValueError, msg): + lib.maybe_convert_numeric( + np.array(['foo_' + infinity], dtype=object), + na_values, maybe_int) + class Testisscalar(tm.TestCase): From b4e2d34edcbc404f6c90f76b67bcc5fe26f0945f Mon Sep 17 00:00:00 2001 From: Neil Parley Date: Wed, 25 May 2016 13:34:27 -0400 Subject: [PATCH 52/96] TST: Remove imp and just use importlib to avoid memory error when showing versions closes #13282 Author: Neil Parley Closes #13284 from nparley/imp and squashes the following commits: 29b98f7 [Neil Parley] Move import to top ca5fa7a [Neil Parley] Remove imp and just use importlib to avoid memory error when importing blosc. 
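
For readers skimming the patch: ``imp.load_module(modname, *imp.find_module(modname))`` re-executes a module's code even when that module is already imported (it behaves like a reload), which is what reportedly triggered the memory error with ``blosc``, while ``importlib.import_module`` goes through the normal import machinery and returns the cached entry from ``sys.modules`` when one exists. A minimal sketch of the lookup pattern the diff below switches to; the ``module_version`` helper is illustrative only and not part of pandas:

    import importlib

    def module_version(modname):
        # A plain import: sys.modules is consulted first, so repeated
        # calls do not re-execute the module the way imp.load_module did.
        try:
            mod = importlib.import_module(modname)
        except ImportError:
            return None
        return getattr(mod, '__version__', None)

    for name in ('numpy', 'blosc'):
        print(name, module_version(name))
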
--- pandas/util/print_versions.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/pandas/util/print_versions.py b/pandas/util/print_versions.py index 115423f3e3e22..e74568f39418c 100644 --- a/pandas/util/print_versions.py +++ b/pandas/util/print_versions.py @@ -4,6 +4,7 @@ import struct import subprocess import codecs +import importlib def get_sys_info(): @@ -55,7 +56,6 @@ def get_sys_info(): def show_versions(as_json=False): - import imp sys_info = get_sys_info() deps = [ @@ -99,11 +99,7 @@ def show_versions(as_json=False): deps_blob = list() for (modname, ver_f) in deps: try: - try: - mod = imp.load_module(modname, *imp.find_module(modname)) - except (ImportError): - import importlib - mod = importlib.import_module(modname) + mod = importlib.import_module(modname) ver = ver_f(mod) deps_blob.append((modname, ver)) except: From f2ce0ac6ecd31d9bf48366ecac293b092279c174 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Lipt=C3=A1k?= Date: Thu, 26 May 2016 08:36:24 -0400 Subject: [PATCH 53/96] ERR: error in datetime conversion with non-convertibles MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit closes #11853 Author: Gábor Lipták Closes #13176 from gliptak/dtbool1 and squashes the following commits: 5179d1d [Gábor Lipták] Bug in pd.to_datetime when passing bools; will now respect the errors value --- doc/source/whatsnew/v0.18.2.txt | 1 + pandas/tseries/tests/test_timeseries.py | 27 +++++++++++++++++++++++++ pandas/tslib.pyx | 17 ++++++++++++++-- 3 files changed, 43 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index 004e2dcc20084..d7918152ad0d9 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -245,6 +245,7 @@ Bug Fixes - Bug in ``Series`` arithmetic raises ``TypeError`` if it contains datetime-like as ``object`` dtype (:issue:`13043`) +- Bug in ``pd.to_datetime()`` when passing invalid datatypes (e.g. 
bool); will now respect the ``errors`` keyword (:issue:`13176`)
 - Bug in ``NaT`` - ``Period`` raises ``AttributeError`` (:issue:`13071`)
 - Bug in ``Period`` addition raises ``TypeError`` if ``Period`` is on right hand side (:issue:`13069`)

diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py
index 50cf38be62779..1564c0a81585e 100644
--- a/pandas/tseries/tests/test_timeseries.py
+++ b/pandas/tseries/tests/test_timeseries.py
@@ -2292,6 +2292,33 @@ def test_to_datetime_tz_psycopg2(self):
                                  dtype='datetime64[ns, UTC]')
         tm.assert_index_equal(result, expected)

+    def test_datetime_bool(self):
+        # GH13176
+        with self.assertRaises(TypeError):
+            to_datetime(False)
+        self.assertTrue(to_datetime(False, errors="coerce") is tslib.NaT)
+        self.assertEqual(to_datetime(False, errors="ignore"), False)
+        with self.assertRaises(TypeError):
+            to_datetime(True)
+        self.assertTrue(to_datetime(True, errors="coerce") is tslib.NaT)
+        self.assertEqual(to_datetime(True, errors="ignore"), True)
+        with self.assertRaises(TypeError):
+            to_datetime([False, datetime.today()])
+        with self.assertRaises(TypeError):
+            to_datetime(['20130101', True])
+        tm.assert_index_equal(to_datetime([0, False, tslib.NaT, 0.0],
+                                          errors="coerce"),
+                              DatetimeIndex([to_datetime(0), tslib.NaT,
+                                             tslib.NaT, to_datetime(0)]))
+
+    def test_datetime_invalid_datatype(self):
+        # GH13176
+
+        with self.assertRaises(TypeError):
+            pd.to_datetime(bool)
+        with self.assertRaises(TypeError):
+            pd.to_datetime(pd.to_datetime)
+
     def test_unit(self):
         # GH 11758
         # test proper behavior with errors

diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx
index f5301d3746e8b..b3fb4989b2f23 100644
--- a/pandas/tslib.pyx
+++ b/pandas/tslib.pyx
@@ -2220,8 +2220,10 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise',
         iresult = result.view('i8')
         for i in range(n):
             val = values[i]
+
             if _checknull_with_nat(val):
                 iresult[i] = NPY_NAT
+
             elif PyDateTime_Check(val):
                 seen_datetime=1
                 if val.tzinfo is not None:
@@ -2250,6 +2252,7 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise',
                         iresult[i] = NPY_NAT
                         continue
                     raise
+
             elif PyDate_Check(val):
                 iresult[i] = _date_to_datetime64(val, &dts)
                 try:
@@ -2260,6 +2263,7 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise',
                         iresult[i] = NPY_NAT
                         continue
                     raise
+
             elif util.is_datetime64_object(val):
                 if get_datetime64_value(val) == NPY_NAT:
                     iresult[i] = NPY_NAT
@@ -2273,8 +2277,8 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise',
                         continue
                     raise

-            # these must be ns unit by-definition
             elif is_integer_object(val) or is_float_object(val):
+                # these must be ns unit by-definition

                 if val != val or val == NPY_NAT:
                     iresult[i] = NPY_NAT
@@ -2292,7 +2296,10 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise',
                         iresult[i] = cast_from_unit(val, 'ns')
                     except:
                         iresult[i] = NPY_NAT
-            else:
+
+            elif util.is_string_object(val):
+                # string
+
                 try:
                     if len(val) == 0 or val in _nat_strings:
                         iresult[i] = NPY_NAT
@@ -2340,6 +2347,12 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise',
                         iresult[i] = NPY_NAT
                         continue
                     raise
+            else:
+                if is_coerce:
+                    iresult[i] = NPY_NAT
+                else:
+                    raise TypeError("{0} is not convertible to datetime"
+                                    .format(type(val)))

         if seen_datetime and seen_integer:
             # we have mixed datetimes & integers
From 57ea76fb9a1d0b23943c700a6129d37de6df6adc Mon Sep 17 00:00:00 2001
From: Eduardo Blancas Reyes
Date: Thu, 26 May 2016 08:52:07 -0400
Subject: [PATCH 54/96] DOC: Improved documentation for DataFrame.join

closes #12188

Author: Eduardo
Blancas Reyes

Closes #12193 from edublancas/master and squashes the following commits:

a66f2ea [Eduardo Blancas Reyes] DOC: improves DataFrame.join documentation
8266cdc [Eduardo Blancas Reyes] DOC: improves DataFrame.join documentation
---
 doc/source/merging.rst |  6 +--
 pandas/core/frame.py   | 91 +++++++++++++++++++++++++++++++++++++-----
 2 files changed, 84 insertions(+), 13 deletions(-)

diff --git a/doc/source/merging.rst b/doc/source/merging.rst
index 7908428135308..ba675d9aac830 100644
--- a/doc/source/merging.rst
+++ b/doc/source/merging.rst
@@ -562,10 +562,8 @@ DataFrame instance method, with the calling DataFrame being implicitly
 considered the left object in the join.

 The related ``DataFrame.join`` method, uses ``merge`` internally for the
-index-on-index and index-on-column(s) joins, but *joins on indexes* by default
-rather than trying to join on common columns (the default behavior for
-``merge``). If you are joining on index, you may wish to use ``DataFrame.join``
-to save yourself some typing.
+index-on-index (by default) and column(s)-on-index join. If you are joining on
+index only, you may wish to use ``DataFrame.join`` to save yourself some typing.

 Brief primer on merge methods (relational algebra)
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index b3d01d12c9336..2c8106571f198 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -4351,18 +4351,20 @@ def join(self, other, on=None, how='left', lsuffix='', rsuffix='',
             Series is passed, its name attribute must be set, and that will be
             used as the column name in the resulting joined DataFrame
         on : column name, tuple/list of column names, or array-like
-            Column(s) to use for joining, otherwise join on index. If multiples
+            Column(s) in the caller to join on the index in other,
+            otherwise joins index-on-index. If multiple
             columns given, the passed DataFrame must have a MultiIndex. Can
             pass an array as the join key if not already contained in the
             calling DataFrame. Like an Excel VLOOKUP operation
-        how : {'left', 'right', 'outer', 'inner'}
-            How to handle indexes of the two objects. Default: 'left'
-            for joining on index, None otherwise
-
-            * left: use calling frame's index
-            * right: use input frame's index
-            * outer: form union of indexes
-            * inner: use intersection of indexes
+        how : {'left', 'right', 'outer', 'inner'}, default: 'left'
+            How to handle the operation of the two objects.
+
+            * left: use calling frame's index (or column if on is specified)
+            * right: use other frame's index
+            * outer: form union of calling frame's index (or column if on is
+              specified) with other frame's index
+            * inner: form intersection of calling frame's index (or column if
+              on is specified) with other frame's index
         lsuffix : string
             Suffix to use from left frame's overlapping columns
         rsuffix : string
@@ -4376,6 +4378,77 @@ def join(self, other, on=None, how='left', lsuffix='', rsuffix='',
         on, lsuffix, and rsuffix options are not supported when passing a list
         of DataFrame objects

+        Examples
+        --------
+        >>> caller = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3', 'K4', 'K5'],
+        ...                        'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']})
+
+        >>> caller
+            A key
+        0  A0  K0
+        1  A1  K1
+        2  A2  K2
+        3  A3  K3
+        4  A4  K4
+        5  A5  K5
+
+        >>> other = pd.DataFrame({'key': ['K0', 'K1', 'K2'],
+        ...                       'B': ['B0', 'B1', 'B2']})
+
+        >>> other
+            B key
+        0  B0  K0
+        1  B1  K1
+        2  B2  K2
+
+        Join DataFrames using their indexes.
+
+        >>> caller.join(other, lsuffix='_caller', rsuffix='_other')
+            A key_caller    B key_other
+        0  A0         K0   B0        K0
+        1  A1         K1   B1        K1
+        2  A2         K2   B2        K2
+        3  A3         K3  NaN       NaN
+        4  A4         K4  NaN       NaN
+        5  A5         K5  NaN       NaN
+
+        If we want to join using the key columns, we need to set key to be
+        the index in both caller and other. The joined DataFrame will have
+        key as its index.
+
+        >>> caller.set_index('key').join(other.set_index('key'))
+              A    B
+        key
+        K0   A0   B0
+        K1   A1   B1
+        K2   A2   B2
+        K3   A3  NaN
+        K4   A4  NaN
+        K5   A5  NaN
+
+        Another option to join using the key columns is to use the on
+        parameter. DataFrame.join always uses other's index but we can use any
+        column in the caller. This method preserves the original caller's
+        index in the result.
+
+        >>> caller.join(other.set_index('key'), on='key')
+            A key    B
+        0  A0  K0   B0
+        1  A1  K1   B1
+        2  A2  K2   B2
+        3  A3  K3  NaN
+        4  A4  K4  NaN
+        5  A5  K5  NaN
+
+        See also
+        --------
+        DataFrame.merge : For column(s)-on-column(s) operations
+
         Returns
         -------
         joined : DataFrame
From 9662d9140d95c405e9f74b5877196cd2b12d1232 Mon Sep 17 00:00:00 2001
From: sinhrks
Date: Thu, 26 May 2016 12:06:49 -0400
Subject: [PATCH 55/96] TST/CLN: remove np.assert_equal

Author: sinhrks

Closes #13263 from sinhrks/test_assert_equal and squashes the following commits:

a5b2a67 [sinhrks] TST/CLN: remove np.assert_equal
---
 ci/lint.sh                              |  10 +
 pandas/computation/tests/test_eval.py   |  23 +-
 pandas/io/tests/json/test_pandas.py     |  10 +-
 pandas/io/tests/json/test_ujson.py      |  14 +-
 pandas/io/tests/parser/common.py        |   3 +-
 pandas/io/tests/test_excel.py           |   9 +-
 pandas/io/tests/test_ga.py              |   4 +-
 pandas/io/tests/test_html.py            |  41 +-
 pandas/io/tests/test_stata.py           |  16 +-
 pandas/io/tests/test_wb.py              |  11 +-
 pandas/sparse/tests/test_libsparse.py   |  17 +-
 pandas/sparse/tests/test_series.py      |  26 +-
 pandas/stats/tests/test_ols.py          |   3 +-
 pandas/stats/tests/test_var.py          |  40 +-
 pandas/tests/frame/test_misc_api.py     |   2 +-
 pandas/tests/frame/test_repr_info.py    |   5 +-
 pandas/tests/frame/test_to_csv.py       |  10 +-
 pandas/tests/indexes/test_numeric.py    |   2 +-
 pandas/tests/indexing/test_indexing.py  |   4 +-
 pandas/tests/series/test_analytics.py   |   4 +-
 pandas/tests/test_categorical.py        |   2 +-
 pandas/tests/test_expressions.py        |   4 +-
 pandas/tests/test_generic.py            |   5 +-
 pandas/tests/test_graphics.py           |   6 +-
 pandas/tests/test_graphics_others.py    |   2 +-
 pandas/tests/test_groupby.py            |  29 +-
 pandas/tests/test_nanops.py             |  10 +-
 pandas/tests/test_strings.py            |   4 +-
 pandas/tests/test_window.py             |   4 +-
 pandas/tools/tests/test_merge.py        |   4 +-
 pandas/tools/tests/test_pivot.py        |  30 +-
 pandas/tools/tests/test_util.py         |   5 +-
 pandas/tseries/tests/test_converter.py  |   9 +-
 pandas/tseries/tests/test_period.py     | 761 ++++++++++++------------
 pandas/tseries/tests/test_plotting.py   |   4 +-
 pandas/tseries/tests/test_timedeltas.py |   9 +-
 pandas/tseries/tests/test_timeseries.py |   7 +-
 pandas/util/testing.py                  |   1 +
 38 files changed, 571 insertions(+), 579 deletions(-)

diff --git a/ci/lint.sh b/ci/lint.sh
index 6b8f160fc90db..eb4c655e8bd3e 100755
--- a/ci/lint.sh
+++ b/ci/lint.sh
@@ -15,7 +15,17 @@ if [ "$LINT" ]; then
         if [ $? -ne "0" ]; then
             RET=1
         fi
+
     done
+    echo "Linting DONE"
+
+    echo "Check for invalid testing"
+    grep -r --include '*.py' --exclude nosetester.py --exclude testing.py 'numpy.testing' pandas
+    if [ $?
= "0" ]; then + RET=1 + fi + echo "Check for invalid testing DONE" + else echo "NOT Linting" fi diff --git a/pandas/computation/tests/test_eval.py b/pandas/computation/tests/test_eval.py index 143e6017b462a..023519fd7fc20 100644 --- a/pandas/computation/tests/test_eval.py +++ b/pandas/computation/tests/test_eval.py @@ -12,8 +12,6 @@ from numpy.random import randn, rand, randint import numpy as np -from numpy.testing import assert_allclose -from numpy.testing.decorators import slow import pandas as pd from pandas.core import common as com @@ -33,7 +31,8 @@ import pandas.lib as lib from pandas.util.testing import (assert_frame_equal, randbool, assertRaisesRegexp, assert_numpy_array_equal, - assert_produces_warning, assert_series_equal) + assert_produces_warning, assert_series_equal, + slow) from pandas.compat import PY3, u, reduce _series_frame_incompatible = _bool_ops_syms @@ -280,9 +279,13 @@ def check_modulus(self, lhs, arith1, rhs): ex = 'lhs {0} rhs'.format(arith1) result = pd.eval(ex, engine=self.engine, parser=self.parser) expected = lhs % rhs - assert_allclose(result, expected) + + tm.assert_almost_equal(result, expected) expected = self.ne.evaluate('expected {0} rhs'.format(arith1)) - assert_allclose(result, expected) + if isinstance(result, (DataFrame, Series)): + tm.assert_almost_equal(result.values, expected) + else: + tm.assert_almost_equal(result, expected.item()) def check_floor_division(self, lhs, arith1, rhs): ex = 'lhs {0} rhs'.format(arith1) @@ -319,13 +322,13 @@ def check_pow(self, lhs, arith1, rhs): self.assertRaises(AssertionError, tm.assert_numpy_array_equal, result, expected) else: - assert_allclose(result, expected) + tm.assert_almost_equal(result, expected) ex = '(lhs {0} rhs) {0} rhs'.format(arith1) result = pd.eval(ex, engine=self.engine, parser=self.parser) expected = self.get_expected_pow_result( self.get_expected_pow_result(lhs, rhs), rhs) - assert_allclose(result, expected) + tm.assert_almost_equal(result, expected) def check_single_invert_op(self, lhs, cmp1, rhs): # simple @@ -701,10 +704,10 @@ def check_modulus(self, lhs, arith1, rhs): result = pd.eval(ex, engine=self.engine, parser=self.parser) expected = lhs % rhs - assert_allclose(result, expected) + tm.assert_almost_equal(result, expected) expected = _eval_single_bin(expected, arith1, rhs, self.engine) - assert_allclose(result, expected) + tm.assert_almost_equal(result, expected) def check_alignment(self, result, nlhs, ghs, op): try: @@ -1578,7 +1581,7 @@ def test_binary_functions(self): expr = "{0}(a, b)".format(fn) got = self.eval(expr) expect = getattr(np, fn)(a, b) - np.testing.assert_allclose(got, expect) + tm.assert_almost_equal(got, expect, check_names=False) def test_df_use_case(self): df = DataFrame({'a': np.random.randn(10), diff --git a/pandas/io/tests/json/test_pandas.py b/pandas/io/tests/json/test_pandas.py index 6fe559e5cacd8..cad469de86fe9 100644 --- a/pandas/io/tests/json/test_pandas.py +++ b/pandas/io/tests/json/test_pandas.py @@ -87,7 +87,7 @@ def test_frame_double_encoded_labels(self): orient='index')) df_unser = read_json(df.to_json(orient='records'), orient='records') assert_index_equal(df.columns, df_unser.columns) - np.testing.assert_equal(df.values, df_unser.values) + tm.assert_numpy_array_equal(df.values, df_unser.values) def test_frame_non_unique_index(self): df = DataFrame([['a', 'b'], ['c', 'd']], index=[1, 1], @@ -100,9 +100,9 @@ def test_frame_non_unique_index(self): orient='split')) unser = read_json(df.to_json(orient='records'), orient='records') 
self.assertTrue(df.columns.equals(unser.columns)) - np.testing.assert_equal(df.values, unser.values) + tm.assert_numpy_array_equal(df.values, unser.values) unser = read_json(df.to_json(orient='values'), orient='values') - np.testing.assert_equal(df.values, unser.values) + tm.assert_numpy_array_equal(df.values, unser.values) def test_frame_non_unique_columns(self): df = DataFrame([['a', 'b'], ['c', 'd']], index=[1, 2], @@ -115,7 +115,7 @@ def test_frame_non_unique_columns(self): assert_frame_equal(df, read_json(df.to_json(orient='split'), orient='split', dtype=False)) unser = read_json(df.to_json(orient='values'), orient='values') - np.testing.assert_equal(df.values, unser.values) + tm.assert_numpy_array_equal(df.values, unser.values) # GH4377; duplicate columns not processing correctly df = DataFrame([['a', 'b'], ['c', 'd']], index=[ @@ -487,7 +487,7 @@ def test_series_non_unique_index(self): orient='split', typ='series')) unser = read_json(s.to_json(orient='records'), orient='records', typ='series') - np.testing.assert_equal(s.values, unser.values) + tm.assert_numpy_array_equal(s.values, unser.values) def test_series_from_json_to_json(self): diff --git a/pandas/io/tests/json/test_ujson.py b/pandas/io/tests/json/test_ujson.py index babcd910a2edd..8e4b492c984f1 100644 --- a/pandas/io/tests/json/test_ujson.py +++ b/pandas/io/tests/json/test_ujson.py @@ -21,8 +21,6 @@ import pandas.compat as compat import numpy as np -from numpy.testing import (assert_array_almost_equal_nulp, - assert_approx_equal) from pandas import DataFrame, Series, Index, NaT, DatetimeIndex import pandas.util.testing as tm @@ -1015,19 +1013,19 @@ def testFloatArray(self): inpt = arr.astype(dtype) outp = np.array(ujson.decode(ujson.encode( inpt, double_precision=15)), dtype=dtype) - assert_array_almost_equal_nulp(inpt, outp) + tm.assert_almost_equal(inpt, outp) def testFloatMax(self): num = np.float(np.finfo(np.float).max / 10) - assert_approx_equal(np.float(ujson.decode( + tm.assert_almost_equal(np.float(ujson.decode( ujson.encode(num, double_precision=15))), num, 15) num = np.float32(np.finfo(np.float32).max / 10) - assert_approx_equal(np.float32(ujson.decode( + tm.assert_almost_equal(np.float32(ujson.decode( ujson.encode(num, double_precision=15))), num, 15) num = np.float64(np.finfo(np.float64).max / 10) - assert_approx_equal(np.float64(ujson.decode( + tm.assert_almost_equal(np.float64(ujson.decode( ujson.encode(num, double_precision=15))), num, 15) def testArrays(self): @@ -1067,9 +1065,9 @@ def testArrays(self): arr = np.arange(100.202, 200.202, 1, dtype=np.float32) arr = arr.reshape((5, 5, 4)) outp = np.array(ujson.decode(ujson.encode(arr)), dtype=np.float32) - assert_array_almost_equal_nulp(arr, outp) + tm.assert_almost_equal(arr, outp) outp = ujson.decode(ujson.encode(arr), numpy=True, dtype=np.float32) - assert_array_almost_equal_nulp(arr, outp) + tm.assert_almost_equal(arr, outp) def testOdArray(self): def will_raise(): diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py index 3912bbbf11e53..2be0c4edb8f5d 100644 --- a/pandas/io/tests/parser/common.py +++ b/pandas/io/tests/parser/common.py @@ -10,7 +10,6 @@ import nose import numpy as np -from numpy.testing.decorators import slow from pandas.lib import Timestamp import pandas as pd @@ -607,7 +606,7 @@ def test_url(self): tm.assert_frame_equal(url_table, local_table) # TODO: ftp testing - @slow + @tm.slow def test_file(self): # FILE diff --git a/pandas/io/tests/test_excel.py b/pandas/io/tests/test_excel.py index 
af053450d78c4..b7e5360a6f3db 100644 --- a/pandas/io/tests/test_excel.py +++ b/pandas/io/tests/test_excel.py @@ -13,7 +13,6 @@ from numpy import nan import numpy as np -from numpy.testing.decorators import slow import pandas as pd from pandas import DataFrame, Index, MultiIndex @@ -544,7 +543,7 @@ def test_read_from_s3_url(self): local_table = self.get_exceldf('test1') tm.assert_frame_equal(url_table, local_table) - @slow + @tm.slow def test_read_from_file_url(self): # FILE @@ -1102,9 +1101,9 @@ def test_sheets(self): tm.assert_frame_equal(self.frame, recons) recons = read_excel(reader, 'test2', index_col=0) tm.assert_frame_equal(self.tsframe, recons) - np.testing.assert_equal(2, len(reader.sheet_names)) - np.testing.assert_equal('test1', reader.sheet_names[0]) - np.testing.assert_equal('test2', reader.sheet_names[1]) + self.assertEqual(2, len(reader.sheet_names)) + self.assertEqual('test1', reader.sheet_names[0]) + self.assertEqual('test2', reader.sheet_names[1]) def test_colaliases(self): _skip_if_no_xlrd() diff --git a/pandas/io/tests/test_ga.py b/pandas/io/tests/test_ga.py index b8b698691a9f5..469e121f633d7 100644 --- a/pandas/io/tests/test_ga.py +++ b/pandas/io/tests/test_ga.py @@ -7,8 +7,8 @@ import nose import pandas as pd from pandas import compat -from pandas.util.testing import network, assert_frame_equal, with_connectivity_check -from numpy.testing.decorators import slow +from pandas.util.testing import (network, assert_frame_equal, + with_connectivity_check, slow) import pandas.util.testing as tm if compat.PY3: diff --git a/pandas/io/tests/test_html.py b/pandas/io/tests/test_html.py index 21d0748fb6aba..9b68267a0a0a8 100644 --- a/pandas/io/tests/test_html.py +++ b/pandas/io/tests/test_html.py @@ -16,7 +16,6 @@ import numpy as np from numpy.random import rand -from numpy.testing.decorators import slow from pandas import (DataFrame, MultiIndex, read_csv, Timestamp, Index, date_range, Series) @@ -129,7 +128,7 @@ def test_spam_url(self): assert_framelist_equal(df1, df2) - @slow + @tm.slow def test_banklist(self): df1 = self.read_html(self.banklist_data, '.*Florida.*', attrs={'id': 'table'}) @@ -289,9 +288,9 @@ def test_invalid_url(self): self.read_html('http://www.a23950sdfa908sd.com', match='.*Water.*') except ValueError as e: - tm.assert_equal(str(e), 'No tables found') + self.assertEqual(str(e), 'No tables found') - @slow + @tm.slow def test_file_url(self): url = self.banklist_data dfs = self.read_html(file_path_to_url(url), 'First', @@ -300,7 +299,7 @@ def test_file_url(self): for df in dfs: tm.assertIsInstance(df, DataFrame) - @slow + @tm.slow def test_invalid_table_attrs(self): url = self.banklist_data with tm.assertRaisesRegexp(ValueError, 'No tables found'): @@ -311,39 +310,39 @@ def _bank_data(self, *args, **kwargs): return self.read_html(self.banklist_data, 'Metcalf', attrs={'id': 'table'}, *args, **kwargs) - @slow + @tm.slow def test_multiindex_header(self): df = self._bank_data(header=[0, 1])[0] tm.assertIsInstance(df.columns, MultiIndex) - @slow + @tm.slow def test_multiindex_index(self): df = self._bank_data(index_col=[0, 1])[0] tm.assertIsInstance(df.index, MultiIndex) - @slow + @tm.slow def test_multiindex_header_index(self): df = self._bank_data(header=[0, 1], index_col=[0, 1])[0] tm.assertIsInstance(df.columns, MultiIndex) tm.assertIsInstance(df.index, MultiIndex) - @slow + @tm.slow def test_multiindex_header_skiprows_tuples(self): df = self._bank_data(header=[0, 1], skiprows=1, tupleize_cols=True)[0] tm.assertIsInstance(df.columns, Index) - @slow + @tm.slow def 
test_multiindex_header_skiprows(self): df = self._bank_data(header=[0, 1], skiprows=1)[0] tm.assertIsInstance(df.columns, MultiIndex) - @slow + @tm.slow def test_multiindex_header_index_skiprows(self): df = self._bank_data(header=[0, 1], index_col=[0, 1], skiprows=1)[0] tm.assertIsInstance(df.index, MultiIndex) tm.assertIsInstance(df.columns, MultiIndex) - @slow + @tm.slow def test_regex_idempotency(self): url = self.banklist_data dfs = self.read_html(file_path_to_url(url), @@ -371,7 +370,7 @@ def test_python_docs_table(self): zz = [df.iloc[0, 0][0:4] for df in dfs] self.assertEqual(sorted(zz), sorted(['Repo', 'What'])) - @slow + @tm.slow def test_thousands_macau_stats(self): all_non_nan_table_index = -2 macau_data = os.path.join(DATA_PATH, 'macau.html') @@ -381,7 +380,7 @@ def test_thousands_macau_stats(self): self.assertFalse(any(s.isnull().any() for _, s in df.iteritems())) - @slow + @tm.slow def test_thousands_macau_index_col(self): all_non_nan_table_index = -2 macau_data = os.path.join(DATA_PATH, 'macau.html') @@ -522,7 +521,7 @@ def test_nyse_wsj_commas_table(self): self.assertEqual(df.shape[0], nrows) self.assertTrue(df.columns.equals(columns)) - @slow + @tm.slow def test_banklist_header(self): from pandas.io.html import _remove_whitespace @@ -561,7 +560,7 @@ def try_remove_ws(x): coerce=True) tm.assert_frame_equal(converted, gtnew) - @slow + @tm.slow def test_gold_canyon(self): gc = 'Gold Canyon' with open(self.banklist_data, 'r') as f: @@ -663,7 +662,7 @@ def test_wikipedia_states_table(self): assert os.path.isfile(data), '%r is not a file' % data assert os.path.getsize(data), '%r is an empty file' % data result = self.read_html(data, 'Arizona', header=1)[0] - nose.tools.assert_equal(result['sq mi'].dtype, np.dtype('float64')) + self.assertEqual(result['sq mi'].dtype, np.dtype('float64')) def test_bool_header_arg(self): # GH 6114 @@ -753,7 +752,7 @@ def test_works_on_valid_markup(self): tm.assertIsInstance(dfs, list) tm.assertIsInstance(dfs[0], DataFrame) - @slow + @tm.slow def test_fallback_success(self): _skip_if_none_of(('bs4', 'html5lib')) banklist_data = os.path.join(DATA_PATH, 'banklist.html') @@ -796,7 +795,7 @@ def get_elements_from_file(url, element='table'): return soup.find_all(element) -@slow +@tm.slow def test_bs4_finds_tables(): filepath = os.path.join(DATA_PATH, "spam.html") with warnings.catch_warnings(): @@ -811,13 +810,13 @@ def get_lxml_elements(url, element): return doc.xpath('.//{0}'.format(element)) -@slow +@tm.slow def test_lxml_finds_tables(): filepath = os.path.join(DATA_PATH, "spam.html") assert get_lxml_elements(filepath, 'table') -@slow +@tm.slow def test_lxml_finds_tbody(): filepath = os.path.join(DATA_PATH, "spam.html") assert get_lxml_elements(filepath, 'tbody') diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index 17f74d5789298..830c68d62efad 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -179,7 +179,7 @@ def test_read_dta2(self): w = [x for x in w if x.category is UserWarning] # should get warning for each call to read_dta - tm.assert_equal(len(w), 3) + self.assertEqual(len(w), 3) # buggy test because of the NaT comparison on certain platforms # Format 113 test fails since it does not support tc and tC formats @@ -375,7 +375,7 @@ def test_read_write_dta11(self): with warnings.catch_warnings(record=True) as w: original.to_stata(path, None) # should get a warning for that format. 
- tm.assert_equal(len(w), 1) + self.assertEqual(len(w), 1) written_and_read_again = self.read_dta(path) tm.assert_frame_equal( @@ -403,7 +403,7 @@ def test_read_write_dta12(self): with warnings.catch_warnings(record=True) as w: original.to_stata(path, None) # should get a warning for that format. - tm.assert_equal(len(w), 1) + self.assertEqual(len(w), 1) written_and_read_again = self.read_dta(path) tm.assert_frame_equal( @@ -904,7 +904,7 @@ def test_categorical_warnings_and_errors(self): with warnings.catch_warnings(record=True) as w: original.to_stata(path) # should get a warning for mixed content - tm.assert_equal(len(w), 1) + self.assertEqual(len(w), 1) def test_categorical_with_stata_missing_values(self): values = [['a' + str(i)] for i in range(120)] @@ -986,10 +986,10 @@ def test_categorical_ordering(self): for col in parsed_115: if not is_categorical_dtype(parsed_115[col]): continue - tm.assert_equal(True, parsed_115[col].cat.ordered) - tm.assert_equal(True, parsed_117[col].cat.ordered) - tm.assert_equal(False, parsed_115_unordered[col].cat.ordered) - tm.assert_equal(False, parsed_117_unordered[col].cat.ordered) + self.assertEqual(True, parsed_115[col].cat.ordered) + self.assertEqual(True, parsed_117[col].cat.ordered) + self.assertEqual(False, parsed_115_unordered[col].cat.ordered) + self.assertEqual(False, parsed_117_unordered[col].cat.ordered) def test_read_chunks_117(self): files_117 = [self.dta1_117, self.dta2_117, self.dta3_117, diff --git a/pandas/io/tests/test_wb.py b/pandas/io/tests/test_wb.py index 58386c3f1c145..42884b19de03a 100644 --- a/pandas/io/tests/test_wb.py +++ b/pandas/io/tests/test_wb.py @@ -6,7 +6,6 @@ from pandas.compat import u from pandas.util.testing import network from pandas.util.testing import assert_frame_equal -from numpy.testing.decorators import slow import pandas.util.testing as tm # deprecated @@ -15,7 +14,7 @@ class TestWB(tm.TestCase): - @slow + @tm.slow @network def test_wdi_search(self): @@ -26,7 +25,7 @@ def test_wdi_search(self): result = search('gdp.*capita.*constant') self.assertTrue(result.name.str.contains('GDP').any()) - @slow + @tm.slow @network def test_wdi_download(self): @@ -55,7 +54,7 @@ def test_wdi_download(self): expected.index = result.index assert_frame_equal(result, pandas.DataFrame(expected)) - @slow + @tm.slow @network def test_wdi_download_w_retired_indicator(self): @@ -85,7 +84,7 @@ def test_wdi_download_w_retired_indicator(self): if len(result) > 0: raise nose.SkipTest("Invalid results") - @slow + @tm.slow @network def test_wdi_download_w_crash_inducing_countrycode(self): @@ -103,7 +102,7 @@ def test_wdi_download_w_crash_inducing_countrycode(self): if len(result) > 0: raise nose.SkipTest("Invalid results") - @slow + @tm.slow @network def test_wdi_get_countries(self): result = get_countries() diff --git a/pandas/sparse/tests/test_libsparse.py b/pandas/sparse/tests/test_libsparse.py index 352355fd55c23..6edae66d4e55b 100644 --- a/pandas/sparse/tests/test_libsparse.py +++ b/pandas/sparse/tests/test_libsparse.py @@ -3,7 +3,6 @@ import nose # noqa import numpy as np import operator -from numpy.testing import assert_equal import pandas.util.testing as tm from pandas import compat @@ -51,14 +50,15 @@ def _check_case(xloc, xlen, yloc, ylen, eloc, elen): yindex = BlockIndex(TEST_LENGTH, yloc, ylen) bresult = xindex.make_union(yindex) assert (isinstance(bresult, BlockIndex)) - assert_equal(bresult.blocs, eloc) - assert_equal(bresult.blengths, elen) + tm.assert_numpy_array_equal(bresult.blocs, eloc) + 
tm.assert_numpy_array_equal(bresult.blengths, elen) ixindex = xindex.to_int_index() iyindex = yindex.to_int_index() iresult = ixindex.make_union(iyindex) assert (isinstance(iresult, IntIndex)) - assert_equal(iresult.indices, bresult.to_int_index().indices) + tm.assert_numpy_array_equal(iresult.indices, + bresult.to_int_index().indices) """ x: ---- @@ -411,7 +411,7 @@ def test_to_int_index(self): block = BlockIndex(20, locs, lengths) dense = block.to_int_index() - assert_equal(dense.indices, exp_inds) + tm.assert_numpy_array_equal(dense.indices, exp_inds) def test_to_block_index(self): index = BlockIndex(10, [0, 5], [4, 5]) @@ -489,7 +489,7 @@ def _check_case(xloc, xlen, yloc, ylen, eloc, elen): ydindex, yfill) self.assertTrue(rb_index.to_int_index().equals(ri_index)) - assert_equal(result_block_vals, result_int_vals) + tm.assert_numpy_array_equal(result_block_vals, result_int_vals) # check versus Series... xseries = Series(x, xdindex.indices) @@ -501,8 +501,9 @@ def _check_case(xloc, xlen, yloc, ylen, eloc, elen): series_result = python_op(xseries, yseries) series_result = series_result.reindex(ri_index.indices) - assert_equal(result_block_vals, series_result.values) - assert_equal(result_int_vals, series_result.values) + tm.assert_numpy_array_equal(result_block_vals, + series_result.values) + tm.assert_numpy_array_equal(result_int_vals, series_result.values) check_cases(_check_case) diff --git a/pandas/sparse/tests/test_series.py b/pandas/sparse/tests/test_series.py index 5cbc509b836db..58e3dfbdf66e4 100644 --- a/pandas/sparse/tests/test_series.py +++ b/pandas/sparse/tests/test_series.py @@ -5,7 +5,6 @@ from numpy import nan import numpy as np import pandas as pd -from numpy.testing import assert_equal from pandas import Series, DataFrame, bdate_range from pandas.core.datetools import BDay @@ -148,20 +147,23 @@ def test_series_density(self): def test_sparse_to_dense(self): arr, index = _test_data1() series = self.bseries.to_dense() - assert_equal(series, arr) + tm.assert_series_equal(series, Series(arr, name='bseries')) series = self.bseries.to_dense(sparse_only=True) - assert_equal(series, arr[np.isfinite(arr)]) + + indexer = np.isfinite(arr) + exp = Series(arr[indexer], index=index[indexer], name='bseries') + tm.assert_series_equal(series, exp) series = self.iseries.to_dense() - assert_equal(series, arr) + tm.assert_series_equal(series, Series(arr, name='iseries')) arr, index = _test_data1_zero() series = self.zbseries.to_dense() - assert_equal(series, arr) + tm.assert_series_equal(series, Series(arr, name='zbseries')) series = self.ziseries.to_dense() - assert_equal(series, arr) + tm.assert_series_equal(series, Series(arr)) def test_to_dense_fill_value(self): s = pd.Series([1, np.nan, np.nan, 3, np.nan]) @@ -225,8 +227,8 @@ def test_constructor(self): tm.assertIsInstance(self.iseries.sp_index, IntIndex) self.assertEqual(self.zbseries.fill_value, 0) - assert_equal(self.zbseries.values.values, - self.bseries.to_dense().fillna(0).values) + tm.assert_numpy_array_equal(self.zbseries.values.values, + self.bseries.to_dense().fillna(0).values) # pass SparseSeries def _check_const(sparse, name): @@ -252,7 +254,7 @@ def _check_const(sparse, name): # pass Series bseries2 = SparseSeries(self.bseries.to_dense()) - assert_equal(self.bseries.sp_values, bseries2.sp_values) + tm.assert_numpy_array_equal(self.bseries.sp_values, bseries2.sp_values) # pass dict? 
@@ -292,7 +294,7 @@ def test_constructor_ndarray(self): def test_constructor_nonnan(self): arr = [0, 0, 0, nan, nan] sp_series = SparseSeries(arr, fill_value=0) - assert_equal(sp_series.values.values, arr) + tm.assert_numpy_array_equal(sp_series.values.values, arr) self.assertEqual(len(sp_series), 5) self.assertEqual(sp_series.shape, (5, )) @@ -1049,8 +1051,8 @@ def _check_results_to_coo(results, check): # or compare directly as difference of sparse # assert(abs(A - A_result).max() < 1e-12) # max is failing in python # 2.6 - assert_equal(il, il_result) - assert_equal(jl, jl_result) + tm.assert_numpy_array_equal(il, il_result) + tm.assert_numpy_array_equal(jl, jl_result) def test_concat(self): val1 = np.array([1, 2, np.nan, np.nan, 0, np.nan]) diff --git a/pandas/stats/tests/test_ols.py b/pandas/stats/tests/test_ols.py index 725a4e8296dd2..4932ac8ffdf99 100644 --- a/pandas/stats/tests/test_ols.py +++ b/pandas/stats/tests/test_ols.py @@ -13,7 +13,6 @@ from distutils.version import LooseVersion import nose import numpy as np -from numpy.testing.decorators import slow from pandas import date_range, bdate_range from pandas.core.panel import Panel @@ -22,7 +21,7 @@ from pandas.stats.ols import _filter_data from pandas.stats.plm import NonPooledPanelOLS, PanelOLS from pandas.util.testing import (assert_almost_equal, assert_series_equal, - assert_frame_equal, assertRaisesRegexp) + assert_frame_equal, assertRaisesRegexp, slow) import pandas.util.testing as tm import pandas.compat as compat from .common import BaseTest diff --git a/pandas/stats/tests/test_var.py b/pandas/stats/tests/test_var.py index 9bcd070dc1d33..9f2c95a2d3d5c 100644 --- a/pandas/stats/tests/test_var.py +++ b/pandas/stats/tests/test_var.py @@ -1,9 +1,8 @@ # flake8: noqa from __future__ import print_function -from numpy.testing import run_module_suite, assert_equal, TestCase -from pandas.util.testing import assert_almost_equal +import pandas.util.testing as tm from pandas.compat import range import nose @@ -33,53 +32,56 @@ class CheckVAR(object): def test_params(self): - assert_almost_equal(self.res1.params, self.res2.params, DECIMAL_3) + tm.assert_almost_equal(self.res1.params, self.res2.params, DECIMAL_3) def test_neqs(self): - assert_equal(self.res1.neqs, self.res2.neqs) + tm.assert_numpy_array_equal(self.res1.neqs, self.res2.neqs) def test_nobs(self): - assert_equal(self.res1.avobs, self.res2.nobs) + tm.assert_numpy_array_equal(self.res1.avobs, self.res2.nobs) def test_df_eq(self): - assert_equal(self.res1.df_eq, self.res2.df_eq) + tm.assert_numpy_array_equal(self.res1.df_eq, self.res2.df_eq) def test_rmse(self): results = self.res1.results for i in range(len(results)): - assert_almost_equal(results[i].mse_resid ** .5, - eval('self.res2.rmse_' + str(i + 1)), DECIMAL_6) + tm.assert_almost_equal(results[i].mse_resid ** .5, + eval('self.res2.rmse_' + str(i + 1)), + DECIMAL_6) def test_rsquared(self): results = self.res1.results for i in range(len(results)): - assert_almost_equal(results[i].rsquared, - eval('self.res2.rsquared_' + str(i + 1)), DECIMAL_3) + tm.assert_almost_equal(results[i].rsquared, + eval('self.res2.rsquared_' + str(i + 1)), + DECIMAL_3) def test_llf(self): results = self.res1.results - assert_almost_equal(self.res1.llf, self.res2.llf, DECIMAL_2) + tm.assert_almost_equal(self.res1.llf, self.res2.llf, DECIMAL_2) for i in range(len(results)): - assert_almost_equal(results[i].llf, - eval('self.res2.llf_' + str(i + 1)), DECIMAL_2) + tm.assert_almost_equal(results[i].llf, + eval('self.res2.llf_' + str(i + 1)), + 
DECIMAL_2) def test_aic(self): - assert_almost_equal(self.res1.aic, self.res2.aic) + tm.assert_almost_equal(self.res1.aic, self.res2.aic) def test_bic(self): - assert_almost_equal(self.res1.bic, self.res2.bic) + tm.assert_almost_equal(self.res1.bic, self.res2.bic) def test_hqic(self): - assert_almost_equal(self.res1.hqic, self.res2.hqic) + tm.assert_almost_equal(self.res1.hqic, self.res2.hqic) def test_fpe(self): - assert_almost_equal(self.res1.fpe, self.res2.fpe) + tm.assert_almost_equal(self.res1.fpe, self.res2.fpe) def test_detsig(self): - assert_almost_equal(self.res1.detomega, self.res2.detsig) + tm.assert_almost_equal(self.res1.detomega, self.res2.detsig) def test_bse(self): - assert_almost_equal(self.res1.bse, self.res2.bse, DECIMAL_4) + tm.assert_almost_equal(self.res1.bse, self.res2.bse, DECIMAL_4) class Foo(object): diff --git a/pandas/tests/frame/test_misc_api.py b/pandas/tests/frame/test_misc_api.py index 0857d23dc1176..48b8d641a0f98 100644 --- a/pandas/tests/frame/test_misc_api.py +++ b/pandas/tests/frame/test_misc_api.py @@ -391,7 +391,7 @@ def test_repr_with_mi_nat(self): index=[[pd.NaT, pd.Timestamp('20130101')], ['a', 'b']]) res = repr(df) exp = ' X\nNaT a 1\n2013-01-01 b 2' - nose.tools.assert_equal(res, exp) + self.assertEqual(res, exp) def test_iterkv_deprecation(self): with tm.assert_produces_warning(FutureWarning): diff --git a/pandas/tests/frame/test_repr_info.py b/pandas/tests/frame/test_repr_info.py index 3d4be319092c3..66e592c013fb1 100644 --- a/pandas/tests/frame/test_repr_info.py +++ b/pandas/tests/frame/test_repr_info.py @@ -14,7 +14,6 @@ import pandas.formats.format as fmt import pandas as pd -from numpy.testing.decorators import slow import pandas.util.testing as tm from pandas.tests.frame.common import TestData @@ -43,7 +42,7 @@ def test_repr_mixed(self): foo = repr(self.mixed_frame) # noqa self.mixed_frame.info(verbose=False, buf=buf) - @slow + @tm.slow def test_repr_mixed_big(self): # big mixed biggie = DataFrame({'A': np.random.randn(200), @@ -90,7 +89,7 @@ def test_repr_dimensions(self): with option_context('display.show_dimensions', 'truncate'): self.assertFalse("2 rows x 2 columns" in repr(df)) - @slow + @tm.slow def test_repr_big(self): # big one biggie = DataFrame(np.zeros((200, 4)), columns=lrange(4), diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py index 718f47eea3a0f..9a16714e18be3 100644 --- a/pandas/tests/frame/test_to_csv.py +++ b/pandas/tests/frame/test_to_csv.py @@ -14,14 +14,11 @@ import pandas as pd from pandas.util.testing import (assert_almost_equal, - assert_equal, assert_series_equal, assert_frame_equal, ensure_clean, makeCustomDataframe as mkdf, - assertRaisesRegexp) - -from numpy.testing.decorators import slow + assertRaisesRegexp, slow) import pandas.util.testing as tm from pandas.tests.frame.common import TestData @@ -453,7 +450,7 @@ def test_to_csv_with_mix_columns(self): df = DataFrame({0: ['a', 'b', 'c'], 1: ['aa', 'bb', 'cc']}) df['test'] = 'txt' - assert_equal(df.to_csv(), df.to_csv(columns=[0, 1, 'test'])) + self.assertEqual(df.to_csv(), df.to_csv(columns=[0, 1, 'test'])) def test_to_csv_headers(self): # GH6186, the presence or absence of `index` incorrectly @@ -508,8 +505,7 @@ def test_to_csv_multiindex(self): # do not load index tsframe.to_csv(path) recons = DataFrame.from_csv(path, index_col=None) - np.testing.assert_equal( - len(recons.columns), len(tsframe.columns) + 2) + self.assertEqual(len(recons.columns), len(tsframe.columns) + 2) # no index tsframe.to_csv(path, index=False) diff 
--git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py index abb9d55e27758..1247e4dc62997 100644 --- a/pandas/tests/indexes/test_numeric.py +++ b/pandas/tests/indexes/test_numeric.py @@ -358,7 +358,7 @@ def test_astype_from_object(self): index = Index([1.0, np.nan, 0.2], dtype='object') result = index.astype(float) expected = Float64Index([1.0, np.nan, 0.2]) - tm.assert_equal(result.dtype, expected.dtype) + self.assertEqual(result.dtype, expected.dtype) tm.assert_index_equal(result, expected) def test_fillna_float64(self): diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index 708006a9dc21b..e1fd17f0c26e0 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -20,14 +20,14 @@ MultiIndex, Timestamp, Timedelta) from pandas.util.testing import (assert_almost_equal, assert_series_equal, assert_frame_equal, assert_panel_equal, - assert_attr_equal) + assert_attr_equal, slow) from pandas.formats.printing import pprint_thing from pandas import concat, lib from pandas.core.common import PerformanceWarning import pandas.util.testing as tm from pandas import date_range -from numpy.testing.decorators import slow + _verbose = False diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 878a639a25aa5..34aaccb6464aa 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -1356,7 +1356,7 @@ def test_searchsorted_numeric_dtypes_scalar(self): s = Series([1, 2, 90, 1000, 3e9]) r = s.searchsorted(30) e = 2 - tm.assert_equal(r, e) + self.assertEqual(r, e) r = s.searchsorted([30]) e = np.array([2], dtype=np.int64) @@ -1373,7 +1373,7 @@ def test_search_sorted_datetime64_scalar(self): v = pd.Timestamp('20120102') r = s.searchsorted(v) e = 1 - tm.assert_equal(r, e) + self.assertEqual(r, e) def test_search_sorted_datetime64_list(self): s = Series(pd.date_range('20120101', periods=10, freq='2D')) diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 5a0d079efb4c2..d74fe68617ea2 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -1420,7 +1420,7 @@ def test_sort_values_na_position(self): def test_slicing_directly(self): cat = Categorical(["a", "b", "c", "d", "a", "b", "c"]) sliced = cat[3] - tm.assert_equal(sliced, "d") + self.assertEqual(sliced, "d") sliced = cat[3:5] expected = Categorical(["d", "a"], categories=['a', 'b', 'c', 'd']) self.assert_numpy_array_equal(sliced._codes, expected._codes) diff --git a/pandas/tests/test_expressions.py b/pandas/tests/test_expressions.py index 044272f24a21f..b6ed5dc68f905 100644 --- a/pandas/tests/test_expressions.py +++ b/pandas/tests/test_expressions.py @@ -15,10 +15,10 @@ from pandas import compat from pandas.util.testing import (assert_almost_equal, assert_series_equal, assert_frame_equal, assert_panel_equal, - assert_panel4d_equal) + assert_panel4d_equal, slow) from pandas.formats.printing import pprint_thing import pandas.util.testing as tm -from numpy.testing.decorators import slow + if not expr._USE_NUMEXPR: try: diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py index 794b5e8aa5650..36962a37ec898 100644 --- a/pandas/tests/test_generic.py +++ b/pandas/tests/test_generic.py @@ -21,8 +21,7 @@ assert_frame_equal, assert_panel_equal, assert_panel4d_equal, - assert_almost_equal, - assert_equal) + assert_almost_equal) import pandas.util.testing as tm @@ -1346,7 +1345,7 @@ def 
test_set_attribute(self): df['y'] = [2, 4, 6] df.y = 5 - assert_equal(df.y, 5) + self.assertEqual(df.y, 5) assert_series_equal(df['y'], Series([2, 4, 6], name='y')) def test_pct_change(self): diff --git a/pandas/tests/test_graphics.py b/pandas/tests/test_graphics.py index 3820a9d5f6476..b59d6ac0027dd 100644 --- a/pandas/tests/test_graphics.py +++ b/pandas/tests/test_graphics.py @@ -19,7 +19,7 @@ import pandas.core.common as com import pandas.util.testing as tm from pandas.util.testing import (ensure_clean, - assert_is_valid_plot_return_object) + assert_is_valid_plot_return_object, slow) from pandas.core.config import set_option @@ -27,8 +27,6 @@ from numpy import random from numpy.random import rand, randn -from numpy.testing import assert_allclose -from numpy.testing.decorators import slow import pandas.tools.plotting as plotting """ These tests are for ``Dataframe.plot`` and ``Series.plot``. @@ -140,7 +138,7 @@ def _check_data(self, xp, rs): def check_line(xpl, rsl): xpdata = xpl.get_xydata() rsdata = rsl.get_xydata() - assert_allclose(xpdata, rsdata) + tm.assert_almost_equal(xpdata, rsdata) self.assertEqual(len(xp_lines), len(rs_lines)) [check_line(xpl, rsl) for xpl, rsl in zip(xp_lines, rs_lines)] diff --git a/pandas/tests/test_graphics_others.py b/pandas/tests/test_graphics_others.py index b032ce196c113..7285d84865542 100644 --- a/pandas/tests/test_graphics_others.py +++ b/pandas/tests/test_graphics_others.py @@ -11,12 +11,12 @@ from pandas import Series, DataFrame, MultiIndex from pandas.compat import range, lmap, lzip import pandas.util.testing as tm +from pandas.util.testing import slow import numpy as np from numpy import random from numpy.random import randn -from numpy.testing.decorators import slow import pandas.tools.plotting as plotting from pandas.tests.test_graphics import (TestPlotBase, _check_plot_works, diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 38e6a066d3eea..1996d132e01ba 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -31,7 +31,6 @@ import pandas.util.testing as tm import pandas as pd -from numpy.testing import assert_equal class TestGroupBy(tm.TestCase): @@ -4621,10 +4620,10 @@ def test_timezone_info(self): import pytz df = pd.DataFrame({'a': [1], 'b': [datetime.now(pytz.utc)]}) - tm.assert_equal(df['b'][0].tzinfo, pytz.utc) + self.assertEqual(df['b'][0].tzinfo, pytz.utc) df = pd.DataFrame({'a': [1, 2, 3]}) df['b'] = datetime.now(pytz.utc) - tm.assert_equal(df['b'][0].tzinfo, pytz.utc) + self.assertEqual(df['b'][0].tzinfo, pytz.utc) def test_groupby_with_timegrouper(self): # GH 4161 @@ -5855,24 +5854,24 @@ def test_lexsort_indexer(self): # orders=True, na_position='last' result = _lexsort_indexer(keys, orders=True, na_position='last') expected = list(range(5, 105)) + list(range(5)) + list(range(105, 110)) - assert_equal(result, expected) + tm.assert_numpy_array_equal(result, expected) # orders=True, na_position='first' result = _lexsort_indexer(keys, orders=True, na_position='first') expected = list(range(5)) + list(range(105, 110)) + list(range(5, 105)) - assert_equal(result, expected) + tm.assert_numpy_array_equal(result, expected) # orders=False, na_position='last' result = _lexsort_indexer(keys, orders=False, na_position='last') expected = list(range(104, 4, -1)) + list(range(5)) + list(range(105, 110)) - assert_equal(result, expected) + tm.assert_numpy_array_equal(result, expected) # orders=False, na_position='first' result = _lexsort_indexer(keys, orders=False, na_position='first') expected = 
list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1)) - assert_equal(result, expected) + tm.assert_numpy_array_equal(result, expected) def test_nargsort(self): # np.argsort(items) places NaNs last @@ -5899,53 +5898,53 @@ def test_nargsort(self): result = _nargsort(items, kind='mergesort', ascending=True, na_position='last') expected = list(range(5, 105)) + list(range(5)) + list(range(105, 110)) - assert_equal(result, expected) + tm.assert_numpy_array_equal(result, expected) # mergesort, ascending=True, na_position='first' result = _nargsort(items, kind='mergesort', ascending=True, na_position='first') expected = list(range(5)) + list(range(105, 110)) + list(range(5, 105)) - assert_equal(result, expected) + tm.assert_numpy_array_equal(result, expected) # mergesort, ascending=False, na_position='last' result = _nargsort(items, kind='mergesort', ascending=False, na_position='last') expected = list(range(104, 4, -1)) + list(range(5)) + list(range(105, 110)) - assert_equal(result, expected) + tm.assert_numpy_array_equal(result, expected) # mergesort, ascending=False, na_position='first' result = _nargsort(items, kind='mergesort', ascending=False, na_position='first') expected = list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1)) - assert_equal(result, expected) + tm.assert_numpy_array_equal(result, expected) # mergesort, ascending=True, na_position='last' result = _nargsort(items2, kind='mergesort', ascending=True, na_position='last') expected = list(range(5, 105)) + list(range(5)) + list(range(105, 110)) - assert_equal(result, expected) + tm.assert_numpy_array_equal(result, expected) # mergesort, ascending=True, na_position='first' result = _nargsort(items2, kind='mergesort', ascending=True, na_position='first') expected = list(range(5)) + list(range(105, 110)) + list(range(5, 105)) - assert_equal(result, expected) + tm.assert_numpy_array_equal(result, expected) # mergesort, ascending=False, na_position='last' result = _nargsort(items2, kind='mergesort', ascending=False, na_position='last') expected = list(range(104, 4, -1)) + list(range(5)) + list(range(105, 110)) - assert_equal(result, expected) + tm.assert_numpy_array_equal(result, expected) # mergesort, ascending=False, na_position='first' result = _nargsort(items2, kind='mergesort', ascending=False, na_position='first') expected = list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1)) - assert_equal(result, expected) + tm.assert_numpy_array_equal(result, expected) def test_datetime_count(self): df = DataFrame({'a': [1, 2, 3] * 2, diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py index d33a64002c3b1..7f8fb8fa424d1 100644 --- a/pandas/tests/test_nanops.py +++ b/pandas/tests/test_nanops.py @@ -873,17 +873,15 @@ def test_ground_truth(self): for axis in range(2): for ddof in range(3): var = nanops.nanvar(samples, skipna=True, axis=axis, ddof=ddof) - np.testing.assert_array_almost_equal(var[:3], - variance[axis, ddof]) - np.testing.assert_equal(var[3], np.nan) + tm.assert_almost_equal(var[:3], variance[axis, ddof]) + self.assertTrue(np.isnan(var[3])) # Test nanstd. for axis in range(2): for ddof in range(3): std = nanops.nanstd(samples, skipna=True, axis=axis, ddof=ddof) - np.testing.assert_array_almost_equal( - std[:3], variance[axis, ddof] ** 0.5) - np.testing.assert_equal(std[3], np.nan) + tm.assert_almost_equal(std[:3], variance[axis, ddof] ** 0.5) + self.assertTrue(np.isnan(std[3])) def test_nanstd_roundoff(self): # Regression test for GH 10242 (test data taken from GH 10489). 
Ensure
diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py
index 05525acedc245..423a288077c4d 100644
--- a/pandas/tests/test_strings.py
+++ b/pandas/tests/test_strings.py
@@ -573,7 +573,7 @@ def test_extract_expand_False(self):
         # single group renames series/index properly
         s_or_idx = klass(['A1', 'A2'])
         result = s_or_idx.str.extract(r'(?P<uno>A)\d', expand=False)
-        tm.assert_equal(result.name, 'uno')
+        self.assertEqual(result.name, 'uno')
         tm.assert_numpy_array_equal(result, klass(['A', 'A']))

         s = Series(['A1', 'B2', 'C3'])
@@ -1105,7 +1105,7 @@ def test_empty_str_methods(self):
         # (extract) on empty series

         tm.assert_series_equal(empty_str, empty.str.cat(empty))
-        tm.assert_equal('', empty.str.cat())
+        self.assertEqual('', empty.str.cat())
         tm.assert_series_equal(empty_str, empty.str.title())
         tm.assert_series_equal(empty_int, empty.str.count('a'))
         tm.assert_series_equal(empty_bool, empty.str.contains('a'))
diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py
index 8d9a55bade30d..1185f95dbd51f 100644
--- a/pandas/tests/test_window.py
+++ b/pandas/tests/test_window.py
@@ -6,7 +6,6 @@
 from nose.tools import assert_raises
 from datetime import datetime
 from numpy.random import randn
-from numpy.testing.decorators import slow
 import numpy as np
 from distutils.version import LooseVersion

@@ -15,7 +14,8 @@
                     notnull, concat)
 from pandas.util.testing import (assert_almost_equal, assert_series_equal,
                                  assert_frame_equal, assert_panel_equal,
-                                 assert_index_equal, assert_numpy_array_equal)
+                                 assert_index_equal, assert_numpy_array_equal,
+                                 slow)
 import pandas.core.datetools as datetools
 import pandas.stats.moments as mom
 import pandas.core.window as rwindow
diff --git a/pandas/tools/tests/test_merge.py b/pandas/tools/tests/test_merge.py
index 13f00afb5a489..474ce0f899217 100644
--- a/pandas/tools/tests/test_merge.py
+++ b/pandas/tools/tests/test_merge.py
@@ -17,12 +17,12 @@
 from pandas.util.testing import (assert_frame_equal,
                                  assert_series_equal,
                                  assert_almost_equal,
                                  makeCustomDataframe as mkdf,
-                                 assertRaisesRegexp)
+                                 assertRaisesRegexp, slow)
 from pandas import (isnull, DataFrame, Index, MultiIndex, Panel,
                     Series, date_range, read_csv)
 import pandas.algos as algos
 import pandas.util.testing as tm
-from numpy.testing.decorators import slow
+

 a_ = np.array
diff --git a/pandas/tools/tests/test_pivot.py b/pandas/tools/tests/test_pivot.py
index 5ebd2e4f693cf..82feaae13f771 100644
--- a/pandas/tools/tests/test_pivot.py
+++ b/pandas/tools/tests/test_pivot.py
@@ -1,13 +1,12 @@
 from datetime import datetime, date, timedelta

 import numpy as np
-from numpy.testing import assert_equal

 import pandas as pd
 from pandas import DataFrame, Series, Index, MultiIndex, Grouper
 from pandas.tools.merge import concat
 from pandas.tools.pivot import pivot_table, crosstab
-from pandas.compat import range, u, product
+from pandas.compat import range, product
 import pandas.util.testing as tm

@@ -80,21 +79,13 @@ def test_pivot_table_dropna(self):
         pv_ind = df.pivot_table(
             'quantity', ['customer', 'product'], 'month', dropna=False)

-        m = MultiIndex.from_tuples([(u('A'), u('a')),
-                                    (u('A'), u('b')),
-                                    (u('A'), u('c')),
-                                    (u('A'), u('d')),
-                                    (u('B'), u('a')),
-                                    (u('B'), u('b')),
-                                    (u('B'), u('c')),
-                                    (u('B'), u('d')),
-                                    (u('C'), u('a')),
-                                    (u('C'), u('b')),
-                                    (u('C'), u('c')),
-                                    (u('C'), u('d'))])
-
-        assert_equal(pv_col.columns.values, m.values)
-        assert_equal(pv_ind.index.values, m.values)
+        m = MultiIndex.from_tuples([('A', 'a'), ('A', 'b'), ('A', 'c'),
+                                    ('A', 'd'), ('B', 'a'), ('B', 'b'),
+                                    ('B', 'c'),
('B', 'd'), ('C', 'a'), + ('C', 'b'), ('C', 'c'), ('C', 'd')], + names=['customer', 'product']) + tm.assert_index_equal(pv_col.columns, m) + tm.assert_index_equal(pv_ind.index, m) def test_pass_array(self): result = self.data.pivot_table( @@ -902,8 +893,9 @@ def test_crosstab_dropna(self): res = pd.crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c'], dropna=False) m = MultiIndex.from_tuples([('one', 'dull'), ('one', 'shiny'), - ('two', 'dull'), ('two', 'shiny')]) - assert_equal(res.columns.values, m.values) + ('two', 'dull'), ('two', 'shiny')], + names=['b', 'c']) + tm.assert_index_equal(res.columns, m) def test_categorical_margins(self): # GH 10989 diff --git a/pandas/tools/tests/test_util.py b/pandas/tools/tests/test_util.py index 1c4f55b2defa4..92a41199f264d 100644 --- a/pandas/tools/tests/test_util.py +++ b/pandas/tools/tests/test_util.py @@ -4,7 +4,6 @@ import nose import numpy as np -from numpy.testing import assert_equal import pandas as pd from pandas import date_range, Index @@ -22,7 +21,7 @@ def test_simple(self): result = cartesian_product([x, y]) expected = [np.array(['A', 'A', 'B', 'B', 'C', 'C']), np.array([1, 22, 1, 22, 1, 22])] - assert_equal(result, expected) + tm.assert_numpy_array_equal(result, expected) def test_datetimeindex(self): # regression test for GitHub issue #6439 @@ -30,7 +29,7 @@ def test_datetimeindex(self): x = date_range('2000-01-01', periods=2) result = [Index(y).day for y in cartesian_product([x, x])] expected = [np.array([1, 1, 2, 2]), np.array([1, 2, 1, 2])] - assert_equal(result, expected) + tm.assert_numpy_array_equal(result, expected) class TestLocaleUtils(tm.TestCase): diff --git a/pandas/tseries/tests/test_converter.py b/pandas/tseries/tests/test_converter.py index f2c20f7d3111d..ceb8660efb9cd 100644 --- a/pandas/tseries/tests/test_converter.py +++ b/pandas/tseries/tests/test_converter.py @@ -3,7 +3,6 @@ import nose import numpy as np -from numpy.testing import assert_almost_equal as np_assert_almost_equal from pandas import Timestamp, Period from pandas.compat import u import pandas.util.testing as tm @@ -69,14 +68,14 @@ def test_conversion_float(self): rs = self.dtc.convert( Timestamp('2012-1-1 01:02:03', tz='UTC'), None, None) xp = converter.dates.date2num(Timestamp('2012-1-1 01:02:03', tz='UTC')) - np_assert_almost_equal(rs, xp, decimals) + tm.assert_almost_equal(rs, xp, decimals) rs = self.dtc.convert( Timestamp('2012-1-1 09:02:03', tz='Asia/Hong_Kong'), None, None) - np_assert_almost_equal(rs, xp, decimals) + tm.assert_almost_equal(rs, xp, decimals) rs = self.dtc.convert(datetime(2012, 1, 1, 1, 2, 3), None, None) - np_assert_almost_equal(rs, xp, decimals) + tm.assert_almost_equal(rs, xp, decimals) def test_time_formatter(self): self.tc(90000) @@ -88,7 +87,7 @@ def test_dateindex_conversion(self): dateindex = tm.makeDateIndex(k=10, freq=freq) rs = self.dtc.convert(dateindex, None, None) xp = converter.dates.date2num(dateindex._mpl_repr()) - np_assert_almost_equal(rs, xp, decimals) + tm.assert_almost_equal(rs, xp, decimals) def test_resolution(self): def _assert_less(ts1, ts2): diff --git a/pandas/tseries/tests/test_period.py b/pandas/tseries/tests/test_period.py index 167690e4846e9..b0df824f0a832 100644 --- a/pandas/tseries/tests/test_period.py +++ b/pandas/tseries/tests/test_period.py @@ -8,8 +8,6 @@ from datetime import datetime, date, timedelta -from numpy.ma.testutils import assert_equal - from pandas import Timestamp from pandas.tseries.frequencies import MONTHS, DAYS, _period_code_map from pandas.tseries.period import Period, 
PeriodIndex, period_range @@ -625,7 +623,7 @@ def _ex(*args): def test_properties_annually(self): # Test properties on Periods with annually frequency. a_date = Period(freq='A', year=2007) - assert_equal(a_date.year, 2007) + self.assertEqual(a_date.year, 2007) def test_properties_quarterly(self): # Test properties on Periods with daily frequency. @@ -635,78 +633,78 @@ def test_properties_quarterly(self): # for x in range(3): for qd in (qedec_date, qejan_date, qejun_date): - assert_equal((qd + x).qyear, 2007) - assert_equal((qd + x).quarter, x + 1) + self.assertEqual((qd + x).qyear, 2007) + self.assertEqual((qd + x).quarter, x + 1) def test_properties_monthly(self): # Test properties on Periods with daily frequency. m_date = Period(freq='M', year=2007, month=1) for x in range(11): m_ival_x = m_date + x - assert_equal(m_ival_x.year, 2007) + self.assertEqual(m_ival_x.year, 2007) if 1 <= x + 1 <= 3: - assert_equal(m_ival_x.quarter, 1) + self.assertEqual(m_ival_x.quarter, 1) elif 4 <= x + 1 <= 6: - assert_equal(m_ival_x.quarter, 2) + self.assertEqual(m_ival_x.quarter, 2) elif 7 <= x + 1 <= 9: - assert_equal(m_ival_x.quarter, 3) + self.assertEqual(m_ival_x.quarter, 3) elif 10 <= x + 1 <= 12: - assert_equal(m_ival_x.quarter, 4) - assert_equal(m_ival_x.month, x + 1) + self.assertEqual(m_ival_x.quarter, 4) + self.assertEqual(m_ival_x.month, x + 1) def test_properties_weekly(self): # Test properties on Periods with daily frequency. w_date = Period(freq='W', year=2007, month=1, day=7) # - assert_equal(w_date.year, 2007) - assert_equal(w_date.quarter, 1) - assert_equal(w_date.month, 1) - assert_equal(w_date.week, 1) - assert_equal((w_date - 1).week, 52) - assert_equal(w_date.days_in_month, 31) - assert_equal(Period(freq='W', year=2012, - month=2, day=1).days_in_month, 29) + self.assertEqual(w_date.year, 2007) + self.assertEqual(w_date.quarter, 1) + self.assertEqual(w_date.month, 1) + self.assertEqual(w_date.week, 1) + self.assertEqual((w_date - 1).week, 52) + self.assertEqual(w_date.days_in_month, 31) + self.assertEqual(Period(freq='W', year=2012, + month=2, day=1).days_in_month, 29) def test_properties_weekly_legacy(self): # Test properties on Periods with daily frequency. with tm.assert_produces_warning(FutureWarning): w_date = Period(freq='WK', year=2007, month=1, day=7) # - assert_equal(w_date.year, 2007) - assert_equal(w_date.quarter, 1) - assert_equal(w_date.month, 1) - assert_equal(w_date.week, 1) - assert_equal((w_date - 1).week, 52) - assert_equal(w_date.days_in_month, 31) + self.assertEqual(w_date.year, 2007) + self.assertEqual(w_date.quarter, 1) + self.assertEqual(w_date.month, 1) + self.assertEqual(w_date.week, 1) + self.assertEqual((w_date - 1).week, 52) + self.assertEqual(w_date.days_in_month, 31) with tm.assert_produces_warning(FutureWarning): exp = Period(freq='WK', year=2012, month=2, day=1) - assert_equal(exp.days_in_month, 29) + self.assertEqual(exp.days_in_month, 29) def test_properties_daily(self): # Test properties on Periods with daily frequency. 
b_date = Period(freq='B', year=2007, month=1, day=1) # - assert_equal(b_date.year, 2007) - assert_equal(b_date.quarter, 1) - assert_equal(b_date.month, 1) - assert_equal(b_date.day, 1) - assert_equal(b_date.weekday, 0) - assert_equal(b_date.dayofyear, 1) - assert_equal(b_date.days_in_month, 31) - assert_equal(Period(freq='B', year=2012, - month=2, day=1).days_in_month, 29) + self.assertEqual(b_date.year, 2007) + self.assertEqual(b_date.quarter, 1) + self.assertEqual(b_date.month, 1) + self.assertEqual(b_date.day, 1) + self.assertEqual(b_date.weekday, 0) + self.assertEqual(b_date.dayofyear, 1) + self.assertEqual(b_date.days_in_month, 31) + self.assertEqual(Period(freq='B', year=2012, + month=2, day=1).days_in_month, 29) # d_date = Period(freq='D', year=2007, month=1, day=1) # - assert_equal(d_date.year, 2007) - assert_equal(d_date.quarter, 1) - assert_equal(d_date.month, 1) - assert_equal(d_date.day, 1) - assert_equal(d_date.weekday, 0) - assert_equal(d_date.dayofyear, 1) - assert_equal(d_date.days_in_month, 31) - assert_equal(Period(freq='D', year=2012, month=2, - day=1).days_in_month, 29) + self.assertEqual(d_date.year, 2007) + self.assertEqual(d_date.quarter, 1) + self.assertEqual(d_date.month, 1) + self.assertEqual(d_date.day, 1) + self.assertEqual(d_date.weekday, 0) + self.assertEqual(d_date.dayofyear, 1) + self.assertEqual(d_date.days_in_month, 31) + self.assertEqual(Period(freq='D', year=2012, month=2, + day=1).days_in_month, 29) def test_properties_hourly(self): # Test properties on Periods with hourly frequency. @@ -714,50 +712,50 @@ def test_properties_hourly(self): h_date2 = Period(freq='2H', year=2007, month=1, day=1, hour=0) for h_date in [h_date1, h_date2]: - assert_equal(h_date.year, 2007) - assert_equal(h_date.quarter, 1) - assert_equal(h_date.month, 1) - assert_equal(h_date.day, 1) - assert_equal(h_date.weekday, 0) - assert_equal(h_date.dayofyear, 1) - assert_equal(h_date.hour, 0) - assert_equal(h_date.days_in_month, 31) - assert_equal(Period(freq='H', year=2012, month=2, day=1, - hour=0).days_in_month, 29) + self.assertEqual(h_date.year, 2007) + self.assertEqual(h_date.quarter, 1) + self.assertEqual(h_date.month, 1) + self.assertEqual(h_date.day, 1) + self.assertEqual(h_date.weekday, 0) + self.assertEqual(h_date.dayofyear, 1) + self.assertEqual(h_date.hour, 0) + self.assertEqual(h_date.days_in_month, 31) + self.assertEqual(Period(freq='H', year=2012, month=2, day=1, + hour=0).days_in_month, 29) def test_properties_minutely(self): # Test properties on Periods with minutely frequency. t_date = Period(freq='Min', year=2007, month=1, day=1, hour=0, minute=0) # - assert_equal(t_date.quarter, 1) - assert_equal(t_date.month, 1) - assert_equal(t_date.day, 1) - assert_equal(t_date.weekday, 0) - assert_equal(t_date.dayofyear, 1) - assert_equal(t_date.hour, 0) - assert_equal(t_date.minute, 0) - assert_equal(t_date.days_in_month, 31) - assert_equal(Period(freq='D', year=2012, month=2, day=1, hour=0, - minute=0).days_in_month, 29) + self.assertEqual(t_date.quarter, 1) + self.assertEqual(t_date.month, 1) + self.assertEqual(t_date.day, 1) + self.assertEqual(t_date.weekday, 0) + self.assertEqual(t_date.dayofyear, 1) + self.assertEqual(t_date.hour, 0) + self.assertEqual(t_date.minute, 0) + self.assertEqual(t_date.days_in_month, 31) + self.assertEqual(Period(freq='D', year=2012, month=2, day=1, hour=0, + minute=0).days_in_month, 29) def test_properties_secondly(self): # Test properties on Periods with secondly frequency. 
s_date = Period(freq='Min', year=2007, month=1, day=1, hour=0, minute=0, second=0) # - assert_equal(s_date.year, 2007) - assert_equal(s_date.quarter, 1) - assert_equal(s_date.month, 1) - assert_equal(s_date.day, 1) - assert_equal(s_date.weekday, 0) - assert_equal(s_date.dayofyear, 1) - assert_equal(s_date.hour, 0) - assert_equal(s_date.minute, 0) - assert_equal(s_date.second, 0) - assert_equal(s_date.days_in_month, 31) - assert_equal(Period(freq='Min', year=2012, month=2, day=1, hour=0, - minute=0, second=0).days_in_month, 29) + self.assertEqual(s_date.year, 2007) + self.assertEqual(s_date.quarter, 1) + self.assertEqual(s_date.month, 1) + self.assertEqual(s_date.day, 1) + self.assertEqual(s_date.weekday, 0) + self.assertEqual(s_date.dayofyear, 1) + self.assertEqual(s_date.hour, 0) + self.assertEqual(s_date.minute, 0) + self.assertEqual(s_date.second, 0) + self.assertEqual(s_date.days_in_month, 31) + self.assertEqual(Period(freq='Min', year=2012, month=2, day=1, hour=0, + minute=0, second=0).days_in_month, 29) def test_properties_nat(self): p_nat = Period('NaT', freq='M') @@ -894,35 +892,35 @@ def test_conv_annual(self): ival_ANOV_to_D_end = Period(freq='D', year=2007, month=11, day=30) ival_ANOV_to_D_start = Period(freq='D', year=2006, month=12, day=1) - assert_equal(ival_A.asfreq('Q', 'S'), ival_A_to_Q_start) - assert_equal(ival_A.asfreq('Q', 'e'), ival_A_to_Q_end) - assert_equal(ival_A.asfreq('M', 's'), ival_A_to_M_start) - assert_equal(ival_A.asfreq('M', 'E'), ival_A_to_M_end) - assert_equal(ival_A.asfreq('W', 'S'), ival_A_to_W_start) - assert_equal(ival_A.asfreq('W', 'E'), ival_A_to_W_end) - assert_equal(ival_A.asfreq('B', 'S'), ival_A_to_B_start) - assert_equal(ival_A.asfreq('B', 'E'), ival_A_to_B_end) - assert_equal(ival_A.asfreq('D', 'S'), ival_A_to_D_start) - assert_equal(ival_A.asfreq('D', 'E'), ival_A_to_D_end) - assert_equal(ival_A.asfreq('H', 'S'), ival_A_to_H_start) - assert_equal(ival_A.asfreq('H', 'E'), ival_A_to_H_end) - assert_equal(ival_A.asfreq('min', 'S'), ival_A_to_T_start) - assert_equal(ival_A.asfreq('min', 'E'), ival_A_to_T_end) - assert_equal(ival_A.asfreq('T', 'S'), ival_A_to_T_start) - assert_equal(ival_A.asfreq('T', 'E'), ival_A_to_T_end) - assert_equal(ival_A.asfreq('S', 'S'), ival_A_to_S_start) - assert_equal(ival_A.asfreq('S', 'E'), ival_A_to_S_end) - - assert_equal(ival_AJAN.asfreq('D', 'S'), ival_AJAN_to_D_start) - assert_equal(ival_AJAN.asfreq('D', 'E'), ival_AJAN_to_D_end) - - assert_equal(ival_AJUN.asfreq('D', 'S'), ival_AJUN_to_D_start) - assert_equal(ival_AJUN.asfreq('D', 'E'), ival_AJUN_to_D_end) - - assert_equal(ival_ANOV.asfreq('D', 'S'), ival_ANOV_to_D_start) - assert_equal(ival_ANOV.asfreq('D', 'E'), ival_ANOV_to_D_end) - - assert_equal(ival_A.asfreq('A'), ival_A) + self.assertEqual(ival_A.asfreq('Q', 'S'), ival_A_to_Q_start) + self.assertEqual(ival_A.asfreq('Q', 'e'), ival_A_to_Q_end) + self.assertEqual(ival_A.asfreq('M', 's'), ival_A_to_M_start) + self.assertEqual(ival_A.asfreq('M', 'E'), ival_A_to_M_end) + self.assertEqual(ival_A.asfreq('W', 'S'), ival_A_to_W_start) + self.assertEqual(ival_A.asfreq('W', 'E'), ival_A_to_W_end) + self.assertEqual(ival_A.asfreq('B', 'S'), ival_A_to_B_start) + self.assertEqual(ival_A.asfreq('B', 'E'), ival_A_to_B_end) + self.assertEqual(ival_A.asfreq('D', 'S'), ival_A_to_D_start) + self.assertEqual(ival_A.asfreq('D', 'E'), ival_A_to_D_end) + self.assertEqual(ival_A.asfreq('H', 'S'), ival_A_to_H_start) + self.assertEqual(ival_A.asfreq('H', 'E'), ival_A_to_H_end) + self.assertEqual(ival_A.asfreq('min', 'S'), 
ival_A_to_T_start) + self.assertEqual(ival_A.asfreq('min', 'E'), ival_A_to_T_end) + self.assertEqual(ival_A.asfreq('T', 'S'), ival_A_to_T_start) + self.assertEqual(ival_A.asfreq('T', 'E'), ival_A_to_T_end) + self.assertEqual(ival_A.asfreq('S', 'S'), ival_A_to_S_start) + self.assertEqual(ival_A.asfreq('S', 'E'), ival_A_to_S_end) + + self.assertEqual(ival_AJAN.asfreq('D', 'S'), ival_AJAN_to_D_start) + self.assertEqual(ival_AJAN.asfreq('D', 'E'), ival_AJAN_to_D_end) + + self.assertEqual(ival_AJUN.asfreq('D', 'S'), ival_AJUN_to_D_start) + self.assertEqual(ival_AJUN.asfreq('D', 'E'), ival_AJUN_to_D_end) + + self.assertEqual(ival_ANOV.asfreq('D', 'S'), ival_ANOV_to_D_start) + self.assertEqual(ival_ANOV.asfreq('D', 'E'), ival_ANOV_to_D_end) + + self.assertEqual(ival_A.asfreq('A'), ival_A) def test_conv_quarterly(self): # frequency conversion tests: from Quarterly Frequency @@ -959,30 +957,30 @@ def test_conv_quarterly(self): ival_QEJUN_to_D_start = Period(freq='D', year=2006, month=7, day=1) ival_QEJUN_to_D_end = Period(freq='D', year=2006, month=9, day=30) - assert_equal(ival_Q.asfreq('A'), ival_Q_to_A) - assert_equal(ival_Q_end_of_year.asfreq('A'), ival_Q_to_A) - - assert_equal(ival_Q.asfreq('M', 'S'), ival_Q_to_M_start) - assert_equal(ival_Q.asfreq('M', 'E'), ival_Q_to_M_end) - assert_equal(ival_Q.asfreq('W', 'S'), ival_Q_to_W_start) - assert_equal(ival_Q.asfreq('W', 'E'), ival_Q_to_W_end) - assert_equal(ival_Q.asfreq('B', 'S'), ival_Q_to_B_start) - assert_equal(ival_Q.asfreq('B', 'E'), ival_Q_to_B_end) - assert_equal(ival_Q.asfreq('D', 'S'), ival_Q_to_D_start) - assert_equal(ival_Q.asfreq('D', 'E'), ival_Q_to_D_end) - assert_equal(ival_Q.asfreq('H', 'S'), ival_Q_to_H_start) - assert_equal(ival_Q.asfreq('H', 'E'), ival_Q_to_H_end) - assert_equal(ival_Q.asfreq('Min', 'S'), ival_Q_to_T_start) - assert_equal(ival_Q.asfreq('Min', 'E'), ival_Q_to_T_end) - assert_equal(ival_Q.asfreq('S', 'S'), ival_Q_to_S_start) - assert_equal(ival_Q.asfreq('S', 'E'), ival_Q_to_S_end) - - assert_equal(ival_QEJAN.asfreq('D', 'S'), ival_QEJAN_to_D_start) - assert_equal(ival_QEJAN.asfreq('D', 'E'), ival_QEJAN_to_D_end) - assert_equal(ival_QEJUN.asfreq('D', 'S'), ival_QEJUN_to_D_start) - assert_equal(ival_QEJUN.asfreq('D', 'E'), ival_QEJUN_to_D_end) - - assert_equal(ival_Q.asfreq('Q'), ival_Q) + self.assertEqual(ival_Q.asfreq('A'), ival_Q_to_A) + self.assertEqual(ival_Q_end_of_year.asfreq('A'), ival_Q_to_A) + + self.assertEqual(ival_Q.asfreq('M', 'S'), ival_Q_to_M_start) + self.assertEqual(ival_Q.asfreq('M', 'E'), ival_Q_to_M_end) + self.assertEqual(ival_Q.asfreq('W', 'S'), ival_Q_to_W_start) + self.assertEqual(ival_Q.asfreq('W', 'E'), ival_Q_to_W_end) + self.assertEqual(ival_Q.asfreq('B', 'S'), ival_Q_to_B_start) + self.assertEqual(ival_Q.asfreq('B', 'E'), ival_Q_to_B_end) + self.assertEqual(ival_Q.asfreq('D', 'S'), ival_Q_to_D_start) + self.assertEqual(ival_Q.asfreq('D', 'E'), ival_Q_to_D_end) + self.assertEqual(ival_Q.asfreq('H', 'S'), ival_Q_to_H_start) + self.assertEqual(ival_Q.asfreq('H', 'E'), ival_Q_to_H_end) + self.assertEqual(ival_Q.asfreq('Min', 'S'), ival_Q_to_T_start) + self.assertEqual(ival_Q.asfreq('Min', 'E'), ival_Q_to_T_end) + self.assertEqual(ival_Q.asfreq('S', 'S'), ival_Q_to_S_start) + self.assertEqual(ival_Q.asfreq('S', 'E'), ival_Q_to_S_end) + + self.assertEqual(ival_QEJAN.asfreq('D', 'S'), ival_QEJAN_to_D_start) + self.assertEqual(ival_QEJAN.asfreq('D', 'E'), ival_QEJAN_to_D_end) + self.assertEqual(ival_QEJUN.asfreq('D', 'S'), ival_QEJUN_to_D_start) + self.assertEqual(ival_QEJUN.asfreq('D', 'E'), 
ival_QEJUN_to_D_end) + + self.assertEqual(ival_Q.asfreq('Q'), ival_Q) def test_conv_monthly(self): # frequency conversion tests: from Monthly Frequency @@ -1009,25 +1007,25 @@ def test_conv_monthly(self): ival_M_to_S_end = Period(freq='S', year=2007, month=1, day=31, hour=23, minute=59, second=59) - assert_equal(ival_M.asfreq('A'), ival_M_to_A) - assert_equal(ival_M_end_of_year.asfreq('A'), ival_M_to_A) - assert_equal(ival_M.asfreq('Q'), ival_M_to_Q) - assert_equal(ival_M_end_of_quarter.asfreq('Q'), ival_M_to_Q) - - assert_equal(ival_M.asfreq('W', 'S'), ival_M_to_W_start) - assert_equal(ival_M.asfreq('W', 'E'), ival_M_to_W_end) - assert_equal(ival_M.asfreq('B', 'S'), ival_M_to_B_start) - assert_equal(ival_M.asfreq('B', 'E'), ival_M_to_B_end) - assert_equal(ival_M.asfreq('D', 'S'), ival_M_to_D_start) - assert_equal(ival_M.asfreq('D', 'E'), ival_M_to_D_end) - assert_equal(ival_M.asfreq('H', 'S'), ival_M_to_H_start) - assert_equal(ival_M.asfreq('H', 'E'), ival_M_to_H_end) - assert_equal(ival_M.asfreq('Min', 'S'), ival_M_to_T_start) - assert_equal(ival_M.asfreq('Min', 'E'), ival_M_to_T_end) - assert_equal(ival_M.asfreq('S', 'S'), ival_M_to_S_start) - assert_equal(ival_M.asfreq('S', 'E'), ival_M_to_S_end) - - assert_equal(ival_M.asfreq('M'), ival_M) + self.assertEqual(ival_M.asfreq('A'), ival_M_to_A) + self.assertEqual(ival_M_end_of_year.asfreq('A'), ival_M_to_A) + self.assertEqual(ival_M.asfreq('Q'), ival_M_to_Q) + self.assertEqual(ival_M_end_of_quarter.asfreq('Q'), ival_M_to_Q) + + self.assertEqual(ival_M.asfreq('W', 'S'), ival_M_to_W_start) + self.assertEqual(ival_M.asfreq('W', 'E'), ival_M_to_W_end) + self.assertEqual(ival_M.asfreq('B', 'S'), ival_M_to_B_start) + self.assertEqual(ival_M.asfreq('B', 'E'), ival_M_to_B_end) + self.assertEqual(ival_M.asfreq('D', 'S'), ival_M_to_D_start) + self.assertEqual(ival_M.asfreq('D', 'E'), ival_M_to_D_end) + self.assertEqual(ival_M.asfreq('H', 'S'), ival_M_to_H_start) + self.assertEqual(ival_M.asfreq('H', 'E'), ival_M_to_H_end) + self.assertEqual(ival_M.asfreq('Min', 'S'), ival_M_to_T_start) + self.assertEqual(ival_M.asfreq('Min', 'E'), ival_M_to_T_end) + self.assertEqual(ival_M.asfreq('S', 'S'), ival_M_to_S_start) + self.assertEqual(ival_M.asfreq('S', 'E'), ival_M_to_S_end) + + self.assertEqual(ival_M.asfreq('M'), ival_M) def test_conv_weekly(self): # frequency conversion tests: from Weekly Frequency @@ -1093,43 +1091,45 @@ def test_conv_weekly(self): ival_W_to_S_end = Period(freq='S', year=2007, month=1, day=7, hour=23, minute=59, second=59) - assert_equal(ival_W.asfreq('A'), ival_W_to_A) - assert_equal(ival_W_end_of_year.asfreq('A'), ival_W_to_A_end_of_year) - assert_equal(ival_W.asfreq('Q'), ival_W_to_Q) - assert_equal(ival_W_end_of_quarter.asfreq('Q'), - ival_W_to_Q_end_of_quarter) - assert_equal(ival_W.asfreq('M'), ival_W_to_M) - assert_equal(ival_W_end_of_month.asfreq('M'), ival_W_to_M_end_of_month) - - assert_equal(ival_W.asfreq('B', 'S'), ival_W_to_B_start) - assert_equal(ival_W.asfreq('B', 'E'), ival_W_to_B_end) - - assert_equal(ival_W.asfreq('D', 'S'), ival_W_to_D_start) - assert_equal(ival_W.asfreq('D', 'E'), ival_W_to_D_end) - - assert_equal(ival_WSUN.asfreq('D', 'S'), ival_WSUN_to_D_start) - assert_equal(ival_WSUN.asfreq('D', 'E'), ival_WSUN_to_D_end) - assert_equal(ival_WSAT.asfreq('D', 'S'), ival_WSAT_to_D_start) - assert_equal(ival_WSAT.asfreq('D', 'E'), ival_WSAT_to_D_end) - assert_equal(ival_WFRI.asfreq('D', 'S'), ival_WFRI_to_D_start) - assert_equal(ival_WFRI.asfreq('D', 'E'), ival_WFRI_to_D_end) - assert_equal(ival_WTHU.asfreq('D', 
'S'), ival_WTHU_to_D_start) - assert_equal(ival_WTHU.asfreq('D', 'E'), ival_WTHU_to_D_end) - assert_equal(ival_WWED.asfreq('D', 'S'), ival_WWED_to_D_start) - assert_equal(ival_WWED.asfreq('D', 'E'), ival_WWED_to_D_end) - assert_equal(ival_WTUE.asfreq('D', 'S'), ival_WTUE_to_D_start) - assert_equal(ival_WTUE.asfreq('D', 'E'), ival_WTUE_to_D_end) - assert_equal(ival_WMON.asfreq('D', 'S'), ival_WMON_to_D_start) - assert_equal(ival_WMON.asfreq('D', 'E'), ival_WMON_to_D_end) - - assert_equal(ival_W.asfreq('H', 'S'), ival_W_to_H_start) - assert_equal(ival_W.asfreq('H', 'E'), ival_W_to_H_end) - assert_equal(ival_W.asfreq('Min', 'S'), ival_W_to_T_start) - assert_equal(ival_W.asfreq('Min', 'E'), ival_W_to_T_end) - assert_equal(ival_W.asfreq('S', 'S'), ival_W_to_S_start) - assert_equal(ival_W.asfreq('S', 'E'), ival_W_to_S_end) - - assert_equal(ival_W.asfreq('W'), ival_W) + self.assertEqual(ival_W.asfreq('A'), ival_W_to_A) + self.assertEqual(ival_W_end_of_year.asfreq('A'), + ival_W_to_A_end_of_year) + self.assertEqual(ival_W.asfreq('Q'), ival_W_to_Q) + self.assertEqual(ival_W_end_of_quarter.asfreq('Q'), + ival_W_to_Q_end_of_quarter) + self.assertEqual(ival_W.asfreq('M'), ival_W_to_M) + self.assertEqual(ival_W_end_of_month.asfreq('M'), + ival_W_to_M_end_of_month) + + self.assertEqual(ival_W.asfreq('B', 'S'), ival_W_to_B_start) + self.assertEqual(ival_W.asfreq('B', 'E'), ival_W_to_B_end) + + self.assertEqual(ival_W.asfreq('D', 'S'), ival_W_to_D_start) + self.assertEqual(ival_W.asfreq('D', 'E'), ival_W_to_D_end) + + self.assertEqual(ival_WSUN.asfreq('D', 'S'), ival_WSUN_to_D_start) + self.assertEqual(ival_WSUN.asfreq('D', 'E'), ival_WSUN_to_D_end) + self.assertEqual(ival_WSAT.asfreq('D', 'S'), ival_WSAT_to_D_start) + self.assertEqual(ival_WSAT.asfreq('D', 'E'), ival_WSAT_to_D_end) + self.assertEqual(ival_WFRI.asfreq('D', 'S'), ival_WFRI_to_D_start) + self.assertEqual(ival_WFRI.asfreq('D', 'E'), ival_WFRI_to_D_end) + self.assertEqual(ival_WTHU.asfreq('D', 'S'), ival_WTHU_to_D_start) + self.assertEqual(ival_WTHU.asfreq('D', 'E'), ival_WTHU_to_D_end) + self.assertEqual(ival_WWED.asfreq('D', 'S'), ival_WWED_to_D_start) + self.assertEqual(ival_WWED.asfreq('D', 'E'), ival_WWED_to_D_end) + self.assertEqual(ival_WTUE.asfreq('D', 'S'), ival_WTUE_to_D_start) + self.assertEqual(ival_WTUE.asfreq('D', 'E'), ival_WTUE_to_D_end) + self.assertEqual(ival_WMON.asfreq('D', 'S'), ival_WMON_to_D_start) + self.assertEqual(ival_WMON.asfreq('D', 'E'), ival_WMON_to_D_end) + + self.assertEqual(ival_W.asfreq('H', 'S'), ival_W_to_H_start) + self.assertEqual(ival_W.asfreq('H', 'E'), ival_W_to_H_end) + self.assertEqual(ival_W.asfreq('Min', 'S'), ival_W_to_T_start) + self.assertEqual(ival_W.asfreq('Min', 'E'), ival_W_to_T_end) + self.assertEqual(ival_W.asfreq('S', 'S'), ival_W_to_S_start) + self.assertEqual(ival_W.asfreq('S', 'E'), ival_W_to_S_end) + + self.assertEqual(ival_W.asfreq('W'), ival_W) def test_conv_weekly_legacy(self): # frequency conversion tests: from Weekly Frequency @@ -1208,44 +1208,46 @@ def test_conv_weekly_legacy(self): ival_W_to_S_end = Period(freq='S', year=2007, month=1, day=7, hour=23, minute=59, second=59) - assert_equal(ival_W.asfreq('A'), ival_W_to_A) - assert_equal(ival_W_end_of_year.asfreq('A'), ival_W_to_A_end_of_year) - assert_equal(ival_W.asfreq('Q'), ival_W_to_Q) - assert_equal(ival_W_end_of_quarter.asfreq('Q'), - ival_W_to_Q_end_of_quarter) - assert_equal(ival_W.asfreq('M'), ival_W_to_M) - assert_equal(ival_W_end_of_month.asfreq('M'), ival_W_to_M_end_of_month) - - assert_equal(ival_W.asfreq('B', 
'S'), ival_W_to_B_start) - assert_equal(ival_W.asfreq('B', 'E'), ival_W_to_B_end) - - assert_equal(ival_W.asfreq('D', 'S'), ival_W_to_D_start) - assert_equal(ival_W.asfreq('D', 'E'), ival_W_to_D_end) - - assert_equal(ival_WSUN.asfreq('D', 'S'), ival_WSUN_to_D_start) - assert_equal(ival_WSUN.asfreq('D', 'E'), ival_WSUN_to_D_end) - assert_equal(ival_WSAT.asfreq('D', 'S'), ival_WSAT_to_D_start) - assert_equal(ival_WSAT.asfreq('D', 'E'), ival_WSAT_to_D_end) - assert_equal(ival_WFRI.asfreq('D', 'S'), ival_WFRI_to_D_start) - assert_equal(ival_WFRI.asfreq('D', 'E'), ival_WFRI_to_D_end) - assert_equal(ival_WTHU.asfreq('D', 'S'), ival_WTHU_to_D_start) - assert_equal(ival_WTHU.asfreq('D', 'E'), ival_WTHU_to_D_end) - assert_equal(ival_WWED.asfreq('D', 'S'), ival_WWED_to_D_start) - assert_equal(ival_WWED.asfreq('D', 'E'), ival_WWED_to_D_end) - assert_equal(ival_WTUE.asfreq('D', 'S'), ival_WTUE_to_D_start) - assert_equal(ival_WTUE.asfreq('D', 'E'), ival_WTUE_to_D_end) - assert_equal(ival_WMON.asfreq('D', 'S'), ival_WMON_to_D_start) - assert_equal(ival_WMON.asfreq('D', 'E'), ival_WMON_to_D_end) - - assert_equal(ival_W.asfreq('H', 'S'), ival_W_to_H_start) - assert_equal(ival_W.asfreq('H', 'E'), ival_W_to_H_end) - assert_equal(ival_W.asfreq('Min', 'S'), ival_W_to_T_start) - assert_equal(ival_W.asfreq('Min', 'E'), ival_W_to_T_end) - assert_equal(ival_W.asfreq('S', 'S'), ival_W_to_S_start) - assert_equal(ival_W.asfreq('S', 'E'), ival_W_to_S_end) + self.assertEqual(ival_W.asfreq('A'), ival_W_to_A) + self.assertEqual(ival_W_end_of_year.asfreq('A'), + ival_W_to_A_end_of_year) + self.assertEqual(ival_W.asfreq('Q'), ival_W_to_Q) + self.assertEqual(ival_W_end_of_quarter.asfreq('Q'), + ival_W_to_Q_end_of_quarter) + self.assertEqual(ival_W.asfreq('M'), ival_W_to_M) + self.assertEqual(ival_W_end_of_month.asfreq('M'), + ival_W_to_M_end_of_month) + + self.assertEqual(ival_W.asfreq('B', 'S'), ival_W_to_B_start) + self.assertEqual(ival_W.asfreq('B', 'E'), ival_W_to_B_end) + + self.assertEqual(ival_W.asfreq('D', 'S'), ival_W_to_D_start) + self.assertEqual(ival_W.asfreq('D', 'E'), ival_W_to_D_end) + + self.assertEqual(ival_WSUN.asfreq('D', 'S'), ival_WSUN_to_D_start) + self.assertEqual(ival_WSUN.asfreq('D', 'E'), ival_WSUN_to_D_end) + self.assertEqual(ival_WSAT.asfreq('D', 'S'), ival_WSAT_to_D_start) + self.assertEqual(ival_WSAT.asfreq('D', 'E'), ival_WSAT_to_D_end) + self.assertEqual(ival_WFRI.asfreq('D', 'S'), ival_WFRI_to_D_start) + self.assertEqual(ival_WFRI.asfreq('D', 'E'), ival_WFRI_to_D_end) + self.assertEqual(ival_WTHU.asfreq('D', 'S'), ival_WTHU_to_D_start) + self.assertEqual(ival_WTHU.asfreq('D', 'E'), ival_WTHU_to_D_end) + self.assertEqual(ival_WWED.asfreq('D', 'S'), ival_WWED_to_D_start) + self.assertEqual(ival_WWED.asfreq('D', 'E'), ival_WWED_to_D_end) + self.assertEqual(ival_WTUE.asfreq('D', 'S'), ival_WTUE_to_D_start) + self.assertEqual(ival_WTUE.asfreq('D', 'E'), ival_WTUE_to_D_end) + self.assertEqual(ival_WMON.asfreq('D', 'S'), ival_WMON_to_D_start) + self.assertEqual(ival_WMON.asfreq('D', 'E'), ival_WMON_to_D_end) + + self.assertEqual(ival_W.asfreq('H', 'S'), ival_W_to_H_start) + self.assertEqual(ival_W.asfreq('H', 'E'), ival_W_to_H_end) + self.assertEqual(ival_W.asfreq('Min', 'S'), ival_W_to_T_start) + self.assertEqual(ival_W.asfreq('Min', 'E'), ival_W_to_T_end) + self.assertEqual(ival_W.asfreq('S', 'S'), ival_W_to_S_start) + self.assertEqual(ival_W.asfreq('S', 'E'), ival_W_to_S_end) with tm.assert_produces_warning(FutureWarning): - assert_equal(ival_W.asfreq('WK'), ival_W) + 
self.assertEqual(ival_W.asfreq('WK'), ival_W) def test_conv_business(self): # frequency conversion tests: from Business Frequency" @@ -1272,25 +1274,25 @@ def test_conv_business(self): ival_B_to_S_end = Period(freq='S', year=2007, month=1, day=1, hour=23, minute=59, second=59) - assert_equal(ival_B.asfreq('A'), ival_B_to_A) - assert_equal(ival_B_end_of_year.asfreq('A'), ival_B_to_A) - assert_equal(ival_B.asfreq('Q'), ival_B_to_Q) - assert_equal(ival_B_end_of_quarter.asfreq('Q'), ival_B_to_Q) - assert_equal(ival_B.asfreq('M'), ival_B_to_M) - assert_equal(ival_B_end_of_month.asfreq('M'), ival_B_to_M) - assert_equal(ival_B.asfreq('W'), ival_B_to_W) - assert_equal(ival_B_end_of_week.asfreq('W'), ival_B_to_W) + self.assertEqual(ival_B.asfreq('A'), ival_B_to_A) + self.assertEqual(ival_B_end_of_year.asfreq('A'), ival_B_to_A) + self.assertEqual(ival_B.asfreq('Q'), ival_B_to_Q) + self.assertEqual(ival_B_end_of_quarter.asfreq('Q'), ival_B_to_Q) + self.assertEqual(ival_B.asfreq('M'), ival_B_to_M) + self.assertEqual(ival_B_end_of_month.asfreq('M'), ival_B_to_M) + self.assertEqual(ival_B.asfreq('W'), ival_B_to_W) + self.assertEqual(ival_B_end_of_week.asfreq('W'), ival_B_to_W) - assert_equal(ival_B.asfreq('D'), ival_B_to_D) + self.assertEqual(ival_B.asfreq('D'), ival_B_to_D) - assert_equal(ival_B.asfreq('H', 'S'), ival_B_to_H_start) - assert_equal(ival_B.asfreq('H', 'E'), ival_B_to_H_end) - assert_equal(ival_B.asfreq('Min', 'S'), ival_B_to_T_start) - assert_equal(ival_B.asfreq('Min', 'E'), ival_B_to_T_end) - assert_equal(ival_B.asfreq('S', 'S'), ival_B_to_S_start) - assert_equal(ival_B.asfreq('S', 'E'), ival_B_to_S_end) + self.assertEqual(ival_B.asfreq('H', 'S'), ival_B_to_H_start) + self.assertEqual(ival_B.asfreq('H', 'E'), ival_B_to_H_end) + self.assertEqual(ival_B.asfreq('Min', 'S'), ival_B_to_T_start) + self.assertEqual(ival_B.asfreq('Min', 'E'), ival_B_to_T_end) + self.assertEqual(ival_B.asfreq('S', 'S'), ival_B_to_S_start) + self.assertEqual(ival_B.asfreq('S', 'E'), ival_B_to_S_end) - assert_equal(ival_B.asfreq('B'), ival_B) + self.assertEqual(ival_B.asfreq('B'), ival_B) def test_conv_daily(self): # frequency conversion tests: from Business Frequency" @@ -1335,36 +1337,39 @@ def test_conv_daily(self): ival_D_to_S_end = Period(freq='S', year=2007, month=1, day=1, hour=23, minute=59, second=59) - assert_equal(ival_D.asfreq('A'), ival_D_to_A) - - assert_equal(ival_D_end_of_quarter.asfreq('A-JAN'), ival_Deoq_to_AJAN) - assert_equal(ival_D_end_of_quarter.asfreq('A-JUN'), ival_Deoq_to_AJUN) - assert_equal(ival_D_end_of_quarter.asfreq('A-DEC'), ival_Deoq_to_ADEC) - - assert_equal(ival_D_end_of_year.asfreq('A'), ival_D_to_A) - assert_equal(ival_D_end_of_quarter.asfreq('Q'), ival_D_to_QEDEC) - assert_equal(ival_D.asfreq("Q-JAN"), ival_D_to_QEJAN) - assert_equal(ival_D.asfreq("Q-JUN"), ival_D_to_QEJUN) - assert_equal(ival_D.asfreq("Q-DEC"), ival_D_to_QEDEC) - assert_equal(ival_D.asfreq('M'), ival_D_to_M) - assert_equal(ival_D_end_of_month.asfreq('M'), ival_D_to_M) - assert_equal(ival_D.asfreq('W'), ival_D_to_W) - assert_equal(ival_D_end_of_week.asfreq('W'), ival_D_to_W) - - assert_equal(ival_D_friday.asfreq('B'), ival_B_friday) - assert_equal(ival_D_saturday.asfreq('B', 'S'), ival_B_friday) - assert_equal(ival_D_saturday.asfreq('B', 'E'), ival_B_monday) - assert_equal(ival_D_sunday.asfreq('B', 'S'), ival_B_friday) - assert_equal(ival_D_sunday.asfreq('B', 'E'), ival_B_monday) - - assert_equal(ival_D.asfreq('H', 'S'), ival_D_to_H_start) - assert_equal(ival_D.asfreq('H', 'E'), ival_D_to_H_end) - 
assert_equal(ival_D.asfreq('Min', 'S'), ival_D_to_T_start) - assert_equal(ival_D.asfreq('Min', 'E'), ival_D_to_T_end) - assert_equal(ival_D.asfreq('S', 'S'), ival_D_to_S_start) - assert_equal(ival_D.asfreq('S', 'E'), ival_D_to_S_end) - - assert_equal(ival_D.asfreq('D'), ival_D) + self.assertEqual(ival_D.asfreq('A'), ival_D_to_A) + + self.assertEqual(ival_D_end_of_quarter.asfreq('A-JAN'), + ival_Deoq_to_AJAN) + self.assertEqual(ival_D_end_of_quarter.asfreq('A-JUN'), + ival_Deoq_to_AJUN) + self.assertEqual(ival_D_end_of_quarter.asfreq('A-DEC'), + ival_Deoq_to_ADEC) + + self.assertEqual(ival_D_end_of_year.asfreq('A'), ival_D_to_A) + self.assertEqual(ival_D_end_of_quarter.asfreq('Q'), ival_D_to_QEDEC) + self.assertEqual(ival_D.asfreq("Q-JAN"), ival_D_to_QEJAN) + self.assertEqual(ival_D.asfreq("Q-JUN"), ival_D_to_QEJUN) + self.assertEqual(ival_D.asfreq("Q-DEC"), ival_D_to_QEDEC) + self.assertEqual(ival_D.asfreq('M'), ival_D_to_M) + self.assertEqual(ival_D_end_of_month.asfreq('M'), ival_D_to_M) + self.assertEqual(ival_D.asfreq('W'), ival_D_to_W) + self.assertEqual(ival_D_end_of_week.asfreq('W'), ival_D_to_W) + + self.assertEqual(ival_D_friday.asfreq('B'), ival_B_friday) + self.assertEqual(ival_D_saturday.asfreq('B', 'S'), ival_B_friday) + self.assertEqual(ival_D_saturday.asfreq('B', 'E'), ival_B_monday) + self.assertEqual(ival_D_sunday.asfreq('B', 'S'), ival_B_friday) + self.assertEqual(ival_D_sunday.asfreq('B', 'E'), ival_B_monday) + + self.assertEqual(ival_D.asfreq('H', 'S'), ival_D_to_H_start) + self.assertEqual(ival_D.asfreq('H', 'E'), ival_D_to_H_end) + self.assertEqual(ival_D.asfreq('Min', 'S'), ival_D_to_T_start) + self.assertEqual(ival_D.asfreq('Min', 'E'), ival_D_to_T_end) + self.assertEqual(ival_D.asfreq('S', 'S'), ival_D_to_S_start) + self.assertEqual(ival_D.asfreq('S', 'E'), ival_D_to_S_end) + + self.assertEqual(ival_D.asfreq('D'), ival_D) def test_conv_hourly(self): # frequency conversion tests: from Hourly Frequency" @@ -1399,25 +1404,25 @@ def test_conv_hourly(self): ival_H_to_S_end = Period(freq='S', year=2007, month=1, day=1, hour=0, minute=59, second=59) - assert_equal(ival_H.asfreq('A'), ival_H_to_A) - assert_equal(ival_H_end_of_year.asfreq('A'), ival_H_to_A) - assert_equal(ival_H.asfreq('Q'), ival_H_to_Q) - assert_equal(ival_H_end_of_quarter.asfreq('Q'), ival_H_to_Q) - assert_equal(ival_H.asfreq('M'), ival_H_to_M) - assert_equal(ival_H_end_of_month.asfreq('M'), ival_H_to_M) - assert_equal(ival_H.asfreq('W'), ival_H_to_W) - assert_equal(ival_H_end_of_week.asfreq('W'), ival_H_to_W) - assert_equal(ival_H.asfreq('D'), ival_H_to_D) - assert_equal(ival_H_end_of_day.asfreq('D'), ival_H_to_D) - assert_equal(ival_H.asfreq('B'), ival_H_to_B) - assert_equal(ival_H_end_of_bus.asfreq('B'), ival_H_to_B) - - assert_equal(ival_H.asfreq('Min', 'S'), ival_H_to_T_start) - assert_equal(ival_H.asfreq('Min', 'E'), ival_H_to_T_end) - assert_equal(ival_H.asfreq('S', 'S'), ival_H_to_S_start) - assert_equal(ival_H.asfreq('S', 'E'), ival_H_to_S_end) - - assert_equal(ival_H.asfreq('H'), ival_H) + self.assertEqual(ival_H.asfreq('A'), ival_H_to_A) + self.assertEqual(ival_H_end_of_year.asfreq('A'), ival_H_to_A) + self.assertEqual(ival_H.asfreq('Q'), ival_H_to_Q) + self.assertEqual(ival_H_end_of_quarter.asfreq('Q'), ival_H_to_Q) + self.assertEqual(ival_H.asfreq('M'), ival_H_to_M) + self.assertEqual(ival_H_end_of_month.asfreq('M'), ival_H_to_M) + self.assertEqual(ival_H.asfreq('W'), ival_H_to_W) + self.assertEqual(ival_H_end_of_week.asfreq('W'), ival_H_to_W) + self.assertEqual(ival_H.asfreq('D'), 
ival_H_to_D) + self.assertEqual(ival_H_end_of_day.asfreq('D'), ival_H_to_D) + self.assertEqual(ival_H.asfreq('B'), ival_H_to_B) + self.assertEqual(ival_H_end_of_bus.asfreq('B'), ival_H_to_B) + + self.assertEqual(ival_H.asfreq('Min', 'S'), ival_H_to_T_start) + self.assertEqual(ival_H.asfreq('Min', 'E'), ival_H_to_T_end) + self.assertEqual(ival_H.asfreq('S', 'S'), ival_H_to_S_start) + self.assertEqual(ival_H.asfreq('S', 'E'), ival_H_to_S_end) + + self.assertEqual(ival_H.asfreq('H'), ival_H) def test_conv_minutely(self): # frequency conversion tests: from Minutely Frequency" @@ -1452,25 +1457,25 @@ def test_conv_minutely(self): ival_T_to_S_end = Period(freq='S', year=2007, month=1, day=1, hour=0, minute=0, second=59) - assert_equal(ival_T.asfreq('A'), ival_T_to_A) - assert_equal(ival_T_end_of_year.asfreq('A'), ival_T_to_A) - assert_equal(ival_T.asfreq('Q'), ival_T_to_Q) - assert_equal(ival_T_end_of_quarter.asfreq('Q'), ival_T_to_Q) - assert_equal(ival_T.asfreq('M'), ival_T_to_M) - assert_equal(ival_T_end_of_month.asfreq('M'), ival_T_to_M) - assert_equal(ival_T.asfreq('W'), ival_T_to_W) - assert_equal(ival_T_end_of_week.asfreq('W'), ival_T_to_W) - assert_equal(ival_T.asfreq('D'), ival_T_to_D) - assert_equal(ival_T_end_of_day.asfreq('D'), ival_T_to_D) - assert_equal(ival_T.asfreq('B'), ival_T_to_B) - assert_equal(ival_T_end_of_bus.asfreq('B'), ival_T_to_B) - assert_equal(ival_T.asfreq('H'), ival_T_to_H) - assert_equal(ival_T_end_of_hour.asfreq('H'), ival_T_to_H) - - assert_equal(ival_T.asfreq('S', 'S'), ival_T_to_S_start) - assert_equal(ival_T.asfreq('S', 'E'), ival_T_to_S_end) - - assert_equal(ival_T.asfreq('Min'), ival_T) + self.assertEqual(ival_T.asfreq('A'), ival_T_to_A) + self.assertEqual(ival_T_end_of_year.asfreq('A'), ival_T_to_A) + self.assertEqual(ival_T.asfreq('Q'), ival_T_to_Q) + self.assertEqual(ival_T_end_of_quarter.asfreq('Q'), ival_T_to_Q) + self.assertEqual(ival_T.asfreq('M'), ival_T_to_M) + self.assertEqual(ival_T_end_of_month.asfreq('M'), ival_T_to_M) + self.assertEqual(ival_T.asfreq('W'), ival_T_to_W) + self.assertEqual(ival_T_end_of_week.asfreq('W'), ival_T_to_W) + self.assertEqual(ival_T.asfreq('D'), ival_T_to_D) + self.assertEqual(ival_T_end_of_day.asfreq('D'), ival_T_to_D) + self.assertEqual(ival_T.asfreq('B'), ival_T_to_B) + self.assertEqual(ival_T_end_of_bus.asfreq('B'), ival_T_to_B) + self.assertEqual(ival_T.asfreq('H'), ival_T_to_H) + self.assertEqual(ival_T_end_of_hour.asfreq('H'), ival_T_to_H) + + self.assertEqual(ival_T.asfreq('S', 'S'), ival_T_to_S_start) + self.assertEqual(ival_T.asfreq('S', 'E'), ival_T_to_S_end) + + self.assertEqual(ival_T.asfreq('Min'), ival_T) def test_conv_secondly(self): # frequency conversion tests: from Secondly Frequency" @@ -1504,24 +1509,24 @@ def test_conv_secondly(self): ival_S_to_T = Period(freq='Min', year=2007, month=1, day=1, hour=0, minute=0) - assert_equal(ival_S.asfreq('A'), ival_S_to_A) - assert_equal(ival_S_end_of_year.asfreq('A'), ival_S_to_A) - assert_equal(ival_S.asfreq('Q'), ival_S_to_Q) - assert_equal(ival_S_end_of_quarter.asfreq('Q'), ival_S_to_Q) - assert_equal(ival_S.asfreq('M'), ival_S_to_M) - assert_equal(ival_S_end_of_month.asfreq('M'), ival_S_to_M) - assert_equal(ival_S.asfreq('W'), ival_S_to_W) - assert_equal(ival_S_end_of_week.asfreq('W'), ival_S_to_W) - assert_equal(ival_S.asfreq('D'), ival_S_to_D) - assert_equal(ival_S_end_of_day.asfreq('D'), ival_S_to_D) - assert_equal(ival_S.asfreq('B'), ival_S_to_B) - assert_equal(ival_S_end_of_bus.asfreq('B'), ival_S_to_B) - assert_equal(ival_S.asfreq('H'), ival_S_to_H) - 
assert_equal(ival_S_end_of_hour.asfreq('H'), ival_S_to_H) - assert_equal(ival_S.asfreq('Min'), ival_S_to_T) - assert_equal(ival_S_end_of_minute.asfreq('Min'), ival_S_to_T) - - assert_equal(ival_S.asfreq('S'), ival_S) + self.assertEqual(ival_S.asfreq('A'), ival_S_to_A) + self.assertEqual(ival_S_end_of_year.asfreq('A'), ival_S_to_A) + self.assertEqual(ival_S.asfreq('Q'), ival_S_to_Q) + self.assertEqual(ival_S_end_of_quarter.asfreq('Q'), ival_S_to_Q) + self.assertEqual(ival_S.asfreq('M'), ival_S_to_M) + self.assertEqual(ival_S_end_of_month.asfreq('M'), ival_S_to_M) + self.assertEqual(ival_S.asfreq('W'), ival_S_to_W) + self.assertEqual(ival_S_end_of_week.asfreq('W'), ival_S_to_W) + self.assertEqual(ival_S.asfreq('D'), ival_S_to_D) + self.assertEqual(ival_S_end_of_day.asfreq('D'), ival_S_to_D) + self.assertEqual(ival_S.asfreq('B'), ival_S_to_B) + self.assertEqual(ival_S_end_of_bus.asfreq('B'), ival_S_to_B) + self.assertEqual(ival_S.asfreq('H'), ival_S_to_H) + self.assertEqual(ival_S_end_of_hour.asfreq('H'), ival_S_to_H) + self.assertEqual(ival_S.asfreq('Min'), ival_S_to_T) + self.assertEqual(ival_S_end_of_minute.asfreq('Min'), ival_S_to_T) + + self.assertEqual(ival_S.asfreq('S'), ival_S) def test_asfreq_nat(self): p = Period('NaT', freq='A') @@ -2246,52 +2251,52 @@ def test_index_unique(self): def test_constructor(self): pi = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009') - assert_equal(len(pi), 9) + self.assertEqual(len(pi), 9) pi = PeriodIndex(freq='Q', start='1/1/2001', end='12/1/2009') - assert_equal(len(pi), 4 * 9) + self.assertEqual(len(pi), 4 * 9) pi = PeriodIndex(freq='M', start='1/1/2001', end='12/1/2009') - assert_equal(len(pi), 12 * 9) + self.assertEqual(len(pi), 12 * 9) pi = PeriodIndex(freq='D', start='1/1/2001', end='12/31/2009') - assert_equal(len(pi), 365 * 9 + 2) + self.assertEqual(len(pi), 365 * 9 + 2) pi = PeriodIndex(freq='B', start='1/1/2001', end='12/31/2009') - assert_equal(len(pi), 261 * 9) + self.assertEqual(len(pi), 261 * 9) pi = PeriodIndex(freq='H', start='1/1/2001', end='12/31/2001 23:00') - assert_equal(len(pi), 365 * 24) + self.assertEqual(len(pi), 365 * 24) pi = PeriodIndex(freq='Min', start='1/1/2001', end='1/1/2001 23:59') - assert_equal(len(pi), 24 * 60) + self.assertEqual(len(pi), 24 * 60) pi = PeriodIndex(freq='S', start='1/1/2001', end='1/1/2001 23:59:59') - assert_equal(len(pi), 24 * 60 * 60) + self.assertEqual(len(pi), 24 * 60 * 60) start = Period('02-Apr-2005', 'B') i1 = PeriodIndex(start=start, periods=20) - assert_equal(len(i1), 20) - assert_equal(i1.freq, start.freq) - assert_equal(i1[0], start) + self.assertEqual(len(i1), 20) + self.assertEqual(i1.freq, start.freq) + self.assertEqual(i1[0], start) end_intv = Period('2006-12-31', 'W') i1 = PeriodIndex(end=end_intv, periods=10) - assert_equal(len(i1), 10) - assert_equal(i1.freq, end_intv.freq) - assert_equal(i1[-1], end_intv) + self.assertEqual(len(i1), 10) + self.assertEqual(i1.freq, end_intv.freq) + self.assertEqual(i1[-1], end_intv) end_intv = Period('2006-12-31', '1w') i2 = PeriodIndex(end=end_intv, periods=10) - assert_equal(len(i1), len(i2)) + self.assertEqual(len(i1), len(i2)) self.assertTrue((i1 == i2).all()) - assert_equal(i1.freq, i2.freq) + self.assertEqual(i1.freq, i2.freq) end_intv = Period('2006-12-31', ('w', 1)) i2 = PeriodIndex(end=end_intv, periods=10) - assert_equal(len(i1), len(i2)) + self.assertEqual(len(i1), len(i2)) self.assertTrue((i1 == i2).all()) - assert_equal(i1.freq, i2.freq) + self.assertEqual(i1.freq, i2.freq) try: PeriodIndex(start=start, end=end_intv) @@ 
-2311,12 +2316,12 @@ def test_constructor(self): # infer freq from first element i2 = PeriodIndex([end_intv, Period('2005-05-05', 'B')]) - assert_equal(len(i2), 2) - assert_equal(i2[0], end_intv) + self.assertEqual(len(i2), 2) + self.assertEqual(i2[0], end_intv) i2 = PeriodIndex(np.array([end_intv, Period('2005-05-05', 'B')])) - assert_equal(len(i2), 2) - assert_equal(i2[0], end_intv) + self.assertEqual(len(i2), 2) + self.assertEqual(i2[0], end_intv) # Mixed freq should fail vals = [end_intv, Period('2006-12-31', 'w')] @@ -2352,33 +2357,33 @@ def test_shift(self): tm.assert_index_equal(pi1.shift(0), pi1) - assert_equal(len(pi1), len(pi2)) - assert_equal(pi1.shift(1).values, pi2.values) + self.assertEqual(len(pi1), len(pi2)) + self.assert_index_equal(pi1.shift(1), pi2) pi1 = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009') pi2 = PeriodIndex(freq='A', start='1/1/2000', end='12/1/2008') - assert_equal(len(pi1), len(pi2)) - assert_equal(pi1.shift(-1).values, pi2.values) + self.assertEqual(len(pi1), len(pi2)) + self.assert_index_equal(pi1.shift(-1), pi2) pi1 = PeriodIndex(freq='M', start='1/1/2001', end='12/1/2009') pi2 = PeriodIndex(freq='M', start='2/1/2001', end='1/1/2010') - assert_equal(len(pi1), len(pi2)) - assert_equal(pi1.shift(1).values, pi2.values) + self.assertEqual(len(pi1), len(pi2)) + self.assert_index_equal(pi1.shift(1), pi2) pi1 = PeriodIndex(freq='M', start='1/1/2001', end='12/1/2009') pi2 = PeriodIndex(freq='M', start='12/1/2000', end='11/1/2009') - assert_equal(len(pi1), len(pi2)) - assert_equal(pi1.shift(-1).values, pi2.values) + self.assertEqual(len(pi1), len(pi2)) + self.assert_index_equal(pi1.shift(-1), pi2) pi1 = PeriodIndex(freq='D', start='1/1/2001', end='12/1/2009') pi2 = PeriodIndex(freq='D', start='1/2/2001', end='12/2/2009') - assert_equal(len(pi1), len(pi2)) - assert_equal(pi1.shift(1).values, pi2.values) + self.assertEqual(len(pi1), len(pi2)) + self.assert_index_equal(pi1.shift(1), pi2) pi1 = PeriodIndex(freq='D', start='1/1/2001', end='12/1/2009') pi2 = PeriodIndex(freq='D', start='12/31/2000', end='11/30/2009') - assert_equal(len(pi1), len(pi2)) - assert_equal(pi1.shift(-1).values, pi2.values) + self.assertEqual(len(pi1), len(pi2)) + self.assert_index_equal(pi1.shift(-1), pi2) def test_shift_nat(self): idx = PeriodIndex(['2011-01', '2011-02', 'NaT', @@ -2496,37 +2501,37 @@ def test_asfreq_mult_pi(self): def test_period_index_length(self): pi = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009') - assert_equal(len(pi), 9) + self.assertEqual(len(pi), 9) pi = PeriodIndex(freq='Q', start='1/1/2001', end='12/1/2009') - assert_equal(len(pi), 4 * 9) + self.assertEqual(len(pi), 4 * 9) pi = PeriodIndex(freq='M', start='1/1/2001', end='12/1/2009') - assert_equal(len(pi), 12 * 9) + self.assertEqual(len(pi), 12 * 9) start = Period('02-Apr-2005', 'B') i1 = PeriodIndex(start=start, periods=20) - assert_equal(len(i1), 20) - assert_equal(i1.freq, start.freq) - assert_equal(i1[0], start) + self.assertEqual(len(i1), 20) + self.assertEqual(i1.freq, start.freq) + self.assertEqual(i1[0], start) end_intv = Period('2006-12-31', 'W') i1 = PeriodIndex(end=end_intv, periods=10) - assert_equal(len(i1), 10) - assert_equal(i1.freq, end_intv.freq) - assert_equal(i1[-1], end_intv) + self.assertEqual(len(i1), 10) + self.assertEqual(i1.freq, end_intv.freq) + self.assertEqual(i1[-1], end_intv) end_intv = Period('2006-12-31', '1w') i2 = PeriodIndex(end=end_intv, periods=10) - assert_equal(len(i1), len(i2)) + self.assertEqual(len(i1), len(i2)) self.assertTrue((i1 == i2).all()) - 
assert_equal(i1.freq, i2.freq) + self.assertEqual(i1.freq, i2.freq) end_intv = Period('2006-12-31', ('w', 1)) i2 = PeriodIndex(end=end_intv, periods=10) - assert_equal(len(i1), len(i2)) + self.assertEqual(len(i1), len(i2)) self.assertTrue((i1 == i2).all()) - assert_equal(i1.freq, i2.freq) + self.assertEqual(i1.freq, i2.freq) try: PeriodIndex(start=start, end=end_intv) @@ -2546,12 +2551,12 @@ def test_period_index_length(self): # infer freq from first element i2 = PeriodIndex([end_intv, Period('2005-05-05', 'B')]) - assert_equal(len(i2), 2) - assert_equal(i2[0], end_intv) + self.assertEqual(len(i2), 2) + self.assertEqual(i2[0], end_intv) i2 = PeriodIndex(np.array([end_intv, Period('2005-05-05', 'B')])) - assert_equal(len(i2), 2) - assert_equal(i2[0], end_intv) + self.assertEqual(len(i2), 2) + self.assertEqual(i2[0], end_intv) # Mixed freq should fail vals = [end_intv, Period('2006-12-31', 'w')] @@ -3124,9 +3129,9 @@ def _check_all_fields(self, periodindex): for field in fields: field_idx = getattr(periodindex, field) - assert_equal(len(periodindex), len(field_idx)) + self.assertEqual(len(periodindex), len(field_idx)) for x, val in zip(periods, field_idx): - assert_equal(getattr(x, field), val) + self.assertEqual(getattr(x, field), val) def test_is_full(self): index = PeriodIndex([2005, 2007, 2009], freq='A') @@ -3327,8 +3332,8 @@ class TestMethods(tm.TestCase): def test_add(self): dt1 = Period(freq='D', year=2008, month=1, day=1) dt2 = Period(freq='D', year=2008, month=1, day=2) - assert_equal(dt1 + 1, dt2) - assert_equal(1 + dt1, dt2) + self.assertEqual(dt1 + 1, dt2) + self.assertEqual(1 + dt1, dt2) def test_add_pdnat(self): p = pd.Period('2011-01', freq='M') diff --git a/pandas/tseries/tests/test_plotting.py b/pandas/tseries/tests/test_plotting.py index 0284df9e58933..67df62e1ebb57 100644 --- a/pandas/tseries/tests/test_plotting.py +++ b/pandas/tseries/tests/test_plotting.py @@ -4,8 +4,6 @@ from pandas.compat import lrange, zip import numpy as np -from numpy.testing.decorators import slow - from pandas import Index, Series, DataFrame from pandas.tseries.index import date_range, bdate_range @@ -13,7 +11,7 @@ from pandas.tseries.period import period_range, Period, PeriodIndex from pandas.tseries.resample import DatetimeIndex -from pandas.util.testing import assert_series_equal, ensure_clean +from pandas.util.testing import assert_series_equal, ensure_clean, slow import pandas.util.testing as tm from pandas.tests.test_graphics import _skip_if_no_scipy_gaussian_kde diff --git a/pandas/tseries/tests/test_timedeltas.py b/pandas/tseries/tests/test_timedeltas.py index 8d02c43e68be3..20098488f7f1c 100644 --- a/pandas/tseries/tests/test_timedeltas.py +++ b/pandas/tseries/tests/test_timedeltas.py @@ -16,7 +16,6 @@ from pandas.tseries.timedeltas import _coerce_scalar_to_timedelta_type as ct from pandas.util.testing import (assert_series_equal, assert_frame_equal, assert_almost_equal, assert_index_equal) -from numpy.testing import assert_allclose from pandas.tseries.offsets import Day, Second import pandas.util.testing as tm from numpy.random import randn @@ -1224,7 +1223,7 @@ def test_total_seconds(self): freq='s') expt = [1 * 86400 + 10 * 3600 + 11 * 60 + 12 + 100123456. / 1e9, 1 * 86400 + 10 * 3600 + 11 * 60 + 13 + 100123456. 
/ 1e9]
-        assert_allclose(rng.total_seconds(), expt, atol=1e-10, rtol=0)
+        tm.assert_almost_equal(rng.total_seconds(), expt)

         # test Series
         s = Series(rng)
@@ -1239,14 +1238,14 @@

         # with both nat
         s = Series([np.nan, np.nan], dtype='timedelta64[ns]')
-        tm.assert_series_equal(s.dt.total_seconds(), Series(
-            [np.nan, np.nan], index=[0, 1]))
+        tm.assert_series_equal(s.dt.total_seconds(),
+                               Series([np.nan, np.nan], index=[0, 1]))

     def test_total_seconds_scalar(self):
         # GH 10939
         rng = Timedelta('1 days, 10:11:12.100123456')
         expt = 1 * 86400 + 10 * 3600 + 11 * 60 + 12 + 100123456. / 1e9
-        assert_allclose(rng.total_seconds(), expt, atol=1e-10, rtol=0)
+        tm.assert_almost_equal(rng.total_seconds(), expt)

         rng = Timedelta(np.nan)
         self.assertTrue(np.isnan(rng.total_seconds()))
diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py
index 1564c0a81585e..3a3315ed3890c 100644
--- a/pandas/tseries/tests/test_timeseries.py
+++ b/pandas/tseries/tests/test_timeseries.py
@@ -5,7 +5,6 @@
 import warnings
 from datetime import datetime, time, timedelta
 from numpy.random import rand
-from numpy.testing.decorators import slow

 import nose
 import numpy as np
@@ -31,7 +30,7 @@
 from pandas.tslib import iNaT
 from pandas.util.testing import (
     assert_frame_equal, assert_series_equal, assert_almost_equal,
-    _skip_if_has_locale)
+    _skip_if_has_locale, slow)

 randn = np.random.randn
@@ -1110,8 +1109,8 @@ def test_asfreq_keep_index_name(self):
         index = pd.date_range('20130101', periods=20, name=index_name)
         df = pd.DataFrame([x for x in range(20)], columns=['foo'], index=index)

-        tm.assert_equal(index_name, df.index.name)
-        tm.assert_equal(index_name, df.asfreq('10D').index.name)
+        self.assertEqual(index_name, df.index.name)
+        self.assertEqual(index_name, df.asfreq('10D').index.name)

     def test_promote_datetime_date(self):
         rng = date_range('1/1/2000', periods=20)
diff --git a/pandas/util/testing.py b/pandas/util/testing.py
index dd66d732ba684..e39dc441bcca4 100644
--- a/pandas/util/testing.py
+++ b/pandas/util/testing.py
@@ -19,6 +19,7 @@
 from distutils.version import LooseVersion

 from numpy.random import randn, rand
+from numpy.testing.decorators import slow  # noqa
 import numpy as np

 import pandas as pd

From a67ac2add4a11eb17e03a55bc1e7bc24818e58cf Mon Sep 17 00:00:00 2001
From: Jeff Reback
Date: Wed, 25 May 2016 09:51:14 -0400
Subject: [PATCH 56/96] COMPAT: extension dtypes (DatetimeTZ, Categorical) are
 now Singleton cached objects

allows for proper is / == comparisons

closes #13285
---
 doc/source/whatsnew/v0.18.2.txt   |  1 +
 pandas/tests/types/test_dtypes.py | 24 +++++++++++++
 pandas/types/dtypes.py            | 56 ++++++++++++++++++++++++-------
 3 files changed, 68 insertions(+), 13 deletions(-)
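The change to ``pandas/types/dtypes.py`` below is plain interning: ``__new__`` consults a class-level cache keyed by the construction parameters and hands back the already-built instance when one exists, so dtypes that compare equal are also the same object. A minimal standalone sketch of the pattern (hypothetical ``InternedDtype`` class for illustration, not the pandas implementation):

    class InternedDtype(object):
        # class-level cache shared by the whole class; keys are the
        # construction parameters, values are the singleton instances
        _cache = {}

        def __new__(cls, unit, tz):
            key = (unit, str(tz))
            try:
                # cache hit: return the existing singleton, so ``==``
                # and ``is`` agree for equal construction parameters
                return cls._cache[key]
            except KeyError:
                obj = object.__new__(cls)
                obj.unit = unit
                obj.tz = tz
                cls._cache[key] = obj
                return obj

With this, ``InternedDtype('ns', 'US/Eastern') is InternedDtype('ns', 'US/Eastern')`` evaluates True, which is what the new ``test_hash_vs_equality`` tests assert for the real dtype classes.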
diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt
index d7918152ad0d9..eeee85de4b5b6 100644
--- a/doc/source/whatsnew/v0.18.2.txt
+++ b/doc/source/whatsnew/v0.18.2.txt
@@ -246,6 +246,7 @@ Bug Fixes

 - Bug in ``pd.to_datetime()`` when passing invalid datatypes (e.g. bool); will now respect the ``errors`` keyword (:issue:`13176`)
+- Bug in extension dtype creation where the created types were not identical under ``is`` comparison (:issue:`13285`)
 - Bug in ``NaT`` - ``Period`` raises ``AttributeError`` (:issue:`13071`)
 - Bug in ``Period`` addition raises ``TypeError`` if ``Period`` is on right hand side (:issue:`13069`)
diff --git a/pandas/tests/types/test_dtypes.py b/pandas/tests/types/test_dtypes.py
index 2a9ad30a07805..d48b9baf64777 100644
--- a/pandas/tests/types/test_dtypes.py
+++ b/pandas/tests/types/test_dtypes.py
@@ -45,6 +45,16 @@ class TestCategoricalDtype(Base, tm.TestCase):
     def setUp(self):
         self.dtype = CategoricalDtype()

+    def test_hash_vs_equality(self):
+        # make sure that we satisfy is semantics
+        dtype = self.dtype
+        dtype2 = CategoricalDtype()
+        self.assertTrue(dtype == dtype2)
+        self.assertTrue(dtype2 == dtype)
+        self.assertTrue(dtype is dtype2)
+        self.assertTrue(dtype2 is dtype)
+        self.assertTrue(hash(dtype) == hash(dtype2))
+
     def test_equality(self):
         self.assertTrue(is_dtype_equal(self.dtype, 'category'))
         self.assertTrue(is_dtype_equal(self.dtype, CategoricalDtype()))
@@ -88,6 +98,20 @@ class TestDatetimeTZDtype(Base, tm.TestCase):
     def setUp(self):
         self.dtype = DatetimeTZDtype('ns', 'US/Eastern')

+    def test_hash_vs_equality(self):
+        # make sure that we satisfy is semantics
+        dtype = self.dtype
+        dtype2 = DatetimeTZDtype('ns', 'US/Eastern')
+        dtype3 = DatetimeTZDtype(dtype2)
+        self.assertTrue(dtype == dtype2)
+        self.assertTrue(dtype2 == dtype)
+        self.assertTrue(dtype3 == dtype)
+        self.assertTrue(dtype is dtype2)
+        self.assertTrue(dtype2 is dtype)
+        self.assertTrue(dtype3 is dtype)
+        self.assertTrue(hash(dtype) == hash(dtype2))
+        self.assertTrue(hash(dtype) == hash(dtype3))
+
     def test_construction(self):
         self.assertRaises(ValueError,
                           lambda: DatetimeTZDtype('ms', 'US/Eastern'))
diff --git a/pandas/types/dtypes.py b/pandas/types/dtypes.py
index e6adbc8500117..140d494c3e1b2 100644
--- a/pandas/types/dtypes.py
+++ b/pandas/types/dtypes.py
@@ -108,6 +108,16 @@ class CategoricalDtype(ExtensionDtype):
     kind = 'O'
     str = '|O08'
     base = np.dtype('O')
+    _cache = {}
+
+    def __new__(cls):
+
+        try:
+            return cls._cache[cls.name]
+        except KeyError:
+            c = object.__new__(cls)
+            cls._cache[cls.name] = c
+            return c

     def __hash__(self):
         # make myself hashable
@@ -155,9 +165,11 @@ class DatetimeTZDtype(ExtensionDtype):
     base = np.dtype('M8[ns]')
     _metadata = ['unit', 'tz']
     _match = re.compile("(datetime64|M8)\[(?P<unit>.+), (?P<tz>.+)\]")
+    _cache = {}
+
+    def __new__(cls, unit=None, tz=None):
+        """ Create a new unit if needed, otherwise return from the cache

-    def __init__(self, unit, tz=None):
-        """
         Parameters
         ----------
         unit : string unit that this represents, currently must be 'ns'
@@ -165,28 +177,46 @@ def __init__(self, unit, tz=None):
         """

         if isinstance(unit, DatetimeTZDtype):
-            self.unit, self.tz = unit.unit, unit.tz
-            return
+            unit, tz = unit.unit, unit.tz

-        if tz is None:
+        elif unit is None:
+            # we are called as an empty constructor
+            # generally for pickle compat
+            return object.__new__(cls)
+
+        elif tz is None:

             # we were passed a string that we can construct
             try:
-                m = self._match.search(unit)
+                m = cls._match.search(unit)
                 if m is not None:
-                    self.unit = m.groupdict()['unit']
-                    self.tz = m.groupdict()['tz']
-                    return
+                    unit = m.groupdict()['unit']
+                    tz = m.groupdict()['tz']
             except:
                 raise ValueError("could not construct DatetimeTZDtype")

+        elif isinstance(unit, compat.string_types):
+
+            if unit != 'ns':
+                raise ValueError("DatetimeTZDtype only supports ns units")
+
+            unit = unit
+            tz = tz
+
+        if tz is None:
             raise ValueError("DatetimeTZDtype constructor must have a tz "
                              "supplied")

-        if unit != 'ns':
-            raise ValueError("DatetimeTZDtype only supports ns units")
-
-        self.unit = unit
-        self.tz = tz
+        # set/retrieve from cache
+        key = (unit, str(tz))
+        try:
+            return cls._cache[key]
+        except KeyError:
+            u = object.__new__(cls)
+            u.unit = unit
+            u.tz = tz
+            cls._cache[key] = u
+            return u

     @classmethod
     def construct_from_string(cls, string):

From 5d6772074a89c1ed7a5c24b078215cb7f9cc6eb3 Mon Sep 17 00:00:00 2001
From: Amol
Date: Thu, 26 May 2016 20:14:51 +0200
Subject: [PATCH 57/96] DOC: Added an example of pitfalls when using astype

- [x] closes #13260
- [ ] tests added / passed
- [ ] passes ``git diff upstream/master | flake8 --diff``
- [ ] whatsnew entry

Author: Amol

Closes #13278 from pfrcks/bug13260 and squashes the following commits:

035a177 [Amol] DOC: Final touches
c30209d [Amol] DOC: Cleaning up
278e922 [Amol] DOC: Some cleaning up
e1877bf [Amol] DOC: Restructured the documentation
f394045 [Amol] DOC: Cleaned up the documentation
705e1a5 [Amol] DOC: Added an example of pitfalls when using astype
---
 doc/source/basics.rst | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/doc/source/basics.rst b/doc/source/basics.rst
index e3b0915cd571d..917d2f2bb8b04 100644
--- a/doc/source/basics.rst
+++ b/doc/source/basics.rst
@@ -1726,6 +1726,28 @@ then the more *general* one will be used as the result of the operation.

    # conversion of dtypes
    df3.astype('float32').dtypes

+Convert a subset of columns to a specified type using :meth:`~DataFrame.astype`.
+
+.. ipython:: python
+
+   dft = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]})
+   dft[['a', 'b']] = dft[['a', 'b']].astype(np.uint8)
+   dft
+   dft.dtypes
+
+.. note::
+
+    When trying to convert a subset of columns to a specified type using :meth:`~DataFrame.astype` and :meth:`~DataFrame.loc`, upcasting occurs.
+
+    :meth:`~DataFrame.loc` tries to fit what we are assigning into the existing dtypes, while ``[]`` will overwrite them, taking the dtype from the right-hand side. Therefore the following piece of code produces an unintended result.
+
+    ..
ipython:: python + + dft = pd.DataFrame({'a': [1,2,3], 'b': [4,5,6], 'c': [7, 8, 9]}) + dft.loc[:, ['a', 'b']].astype(np.uint8).dtypes + dft.loc[:, ['a', 'b']] = dft.loc[:, ['a', 'b']].astype(np.uint8) + dft.dtypes + object conversion ~~~~~~~~~~~~~~~~~ From 456dcae11bd64e22dbdb6a89764ffa69097a95b4 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 26 May 2016 15:00:42 -0400 Subject: [PATCH 58/96] TST: skip Fred / YahooOptions tests --- pandas/io/tests/test_data.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pandas/io/tests/test_data.py b/pandas/io/tests/test_data.py index 6845eb009df5d..1efa8b13598a7 100644 --- a/pandas/io/tests/test_data.py +++ b/pandas/io/tests/test_data.py @@ -302,6 +302,8 @@ class TestYahooOptions(tm.TestCase): @classmethod def setUpClass(cls): super(TestYahooOptions, cls).setUpClass() + raise nose.SkipTest('disable Yahoo Options tests') + _skip_if_no_lxml() _skip_if_no_bs() raise nose.SkipTest('unreliable test') @@ -500,6 +502,12 @@ def test_read_famafrench(self): class TestFred(tm.TestCase): + + @classmethod + def setUpClass(cls): + super(TestFred, cls).setUpClass() + raise nose.SkipTest('disable Fred tests') + @network def test_fred(self): raise nose.SkipTest('buggy as of 2/14/16; maybe a data revision?') From db4382412d0e3ad97b7e0e986656a6e52a498c8d Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 26 May 2016 18:27:37 -0400 Subject: [PATCH 59/96] TST: split up test_merge very small bug fix w.r.t. tz concatting Author: Jeff Reback Closes #13300 from jreback/merges and squashes the following commits: 727dfe9 [Jeff Reback] BUG: concat of same-tz needs string comparison for uniqueness 1560136 [Jeff Reback] TST: split up test_merge --- ci/requirements-3.4.run | 2 +- pandas/tools/tests/test_concat.py | 1035 ++++++++++++++++++++ pandas/tools/tests/test_merge.py | 1113 +--------------------- pandas/tools/tests/test_ordered_merge.py | 93 ++ pandas/types/concat.py | 2 +- 5 files changed, 1138 insertions(+), 1107 deletions(-) create mode 100644 pandas/tools/tests/test_concat.py create mode 100644 pandas/tools/tests/test_ordered_merge.py diff --git a/ci/requirements-3.4.run b/ci/requirements-3.4.run index 7d4cdcd21595a..3e12adae7dd9f 100644 --- a/ci/requirements-3.4.run +++ b/ci/requirements-3.4.run @@ -1,4 +1,4 @@ -pytz +pytz=2015.7 numpy=1.8.1 openpyxl xlsxwriter diff --git a/pandas/tools/tests/test_concat.py b/pandas/tools/tests/test_concat.py new file mode 100644 index 0000000000000..62bd12130ca53 --- /dev/null +++ b/pandas/tools/tests/test_concat.py @@ -0,0 +1,1035 @@ +import nose + +import numpy as np +from numpy.random import randn + +from datetime import datetime +from pandas.compat import StringIO +import pandas as pd +from pandas import (DataFrame, concat, + read_csv, isnull, Series, date_range, + Index, Panel, MultiIndex, Timestamp, + DatetimeIndex) +from pandas.util import testing as tm +from pandas.util.testing import (assert_frame_equal, + makeCustomDataframe as mkdf, + assert_almost_equal) + + +class TestConcatenate(tm.TestCase): + + _multiprocess_can_split_ = True + + def setUp(self): + self.frame = DataFrame(tm.getSeriesData()) + self.mixed_frame = self.frame.copy() + self.mixed_frame['foo'] = 'bar' + + def test_append(self): + begin_index = self.frame.index[:5] + end_index = self.frame.index[5:] + + begin_frame = self.frame.reindex(begin_index) + end_frame = self.frame.reindex(end_index) + + appended = begin_frame.append(end_frame) + assert_almost_equal(appended['A'], self.frame['A']) + + del end_frame['A'] + partial_appended = 
begin_frame.append(end_frame) + self.assertIn('A', partial_appended) + + partial_appended = end_frame.append(begin_frame) + self.assertIn('A', partial_appended) + + # mixed type handling + appended = self.mixed_frame[:5].append(self.mixed_frame[5:]) + assert_frame_equal(appended, self.mixed_frame) + + # what to test here + mixed_appended = self.mixed_frame[:5].append(self.frame[5:]) + mixed_appended2 = self.frame[:5].append(self.mixed_frame[5:]) + + # all equal except 'foo' column + assert_frame_equal( + mixed_appended.reindex(columns=['A', 'B', 'C', 'D']), + mixed_appended2.reindex(columns=['A', 'B', 'C', 'D'])) + + # append empty + empty = DataFrame({}) + + appended = self.frame.append(empty) + assert_frame_equal(self.frame, appended) + self.assertIsNot(appended, self.frame) + + appended = empty.append(self.frame) + assert_frame_equal(self.frame, appended) + self.assertIsNot(appended, self.frame) + + # overlap + self.assertRaises(ValueError, self.frame.append, self.frame, + verify_integrity=True) + + # new columns + # GH 6129 + df = DataFrame({'a': {'x': 1, 'y': 2}, 'b': {'x': 3, 'y': 4}}) + row = Series([5, 6, 7], index=['a', 'b', 'c'], name='z') + expected = DataFrame({'a': {'x': 1, 'y': 2, 'z': 5}, 'b': { + 'x': 3, 'y': 4, 'z': 6}, 'c': {'z': 7}}) + result = df.append(row) + assert_frame_equal(result, expected) + + def test_append_length0_frame(self): + df = DataFrame(columns=['A', 'B', 'C']) + df3 = DataFrame(index=[0, 1], columns=['A', 'B']) + df5 = df.append(df3) + + expected = DataFrame(index=[0, 1], columns=['A', 'B', 'C']) + assert_frame_equal(df5, expected) + + def test_append_records(self): + arr1 = np.zeros((2,), dtype=('i4,f4,a10')) + arr1[:] = [(1, 2., 'Hello'), (2, 3., "World")] + + arr2 = np.zeros((3,), dtype=('i4,f4,a10')) + arr2[:] = [(3, 4., 'foo'), + (5, 6., "bar"), + (7., 8., 'baz')] + + df1 = DataFrame(arr1) + df2 = DataFrame(arr2) + + result = df1.append(df2, ignore_index=True) + expected = DataFrame(np.concatenate((arr1, arr2))) + assert_frame_equal(result, expected) + + def test_append_different_columns(self): + df = DataFrame({'bools': np.random.randn(10) > 0, + 'ints': np.random.randint(0, 10, 10), + 'floats': np.random.randn(10), + 'strings': ['foo', 'bar'] * 5}) + + a = df[:5].ix[:, ['bools', 'ints', 'floats']] + b = df[5:].ix[:, ['strings', 'ints', 'floats']] + + appended = a.append(b) + self.assertTrue(isnull(appended['strings'][0:4]).all()) + self.assertTrue(isnull(appended['bools'][5:]).all()) + + def test_append_many(self): + chunks = [self.frame[:5], self.frame[5:10], + self.frame[10:15], self.frame[15:]] + + result = chunks[0].append(chunks[1:]) + tm.assert_frame_equal(result, self.frame) + + chunks[-1] = chunks[-1].copy() + chunks[-1]['foo'] = 'bar' + result = chunks[0].append(chunks[1:]) + tm.assert_frame_equal(result.ix[:, self.frame.columns], self.frame) + self.assertTrue((result['foo'][15:] == 'bar').all()) + self.assertTrue(result['foo'][:15].isnull().all()) + + def test_append_preserve_index_name(self): + # #980 + df1 = DataFrame(data=None, columns=['A', 'B', 'C']) + df1 = df1.set_index(['A']) + df2 = DataFrame(data=[[1, 4, 7], [2, 5, 8], [3, 6, 9]], + columns=['A', 'B', 'C']) + df2 = df2.set_index(['A']) + + result = df1.append(df2) + self.assertEqual(result.index.name, 'A') + + def test_join_many(self): + df = DataFrame(np.random.randn(10, 6), columns=list('abcdef')) + df_list = [df[['a', 'b']], df[['c', 'd']], df[['e', 'f']]] + + joined = df_list[0].join(df_list[1:]) + tm.assert_frame_equal(joined, df) + + df_list = [df[['a', 'b']][:-2], + 
df[['c', 'd']][2:], df[['e', 'f']][1:9]] + + def _check_diff_index(df_list, result, exp_index): + reindexed = [x.reindex(exp_index) for x in df_list] + expected = reindexed[0].join(reindexed[1:]) + tm.assert_frame_equal(result, expected) + + # different join types + joined = df_list[0].join(df_list[1:], how='outer') + _check_diff_index(df_list, joined, df.index) + + joined = df_list[0].join(df_list[1:]) + _check_diff_index(df_list, joined, df_list[0].index) + + joined = df_list[0].join(df_list[1:], how='inner') + _check_diff_index(df_list, joined, df.index[2:8]) + + self.assertRaises(ValueError, df_list[0].join, df_list[1:], on='a') + + def test_join_many_mixed(self): + df = DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D']) + df['key'] = ['foo', 'bar'] * 4 + df1 = df.ix[:, ['A', 'B']] + df2 = df.ix[:, ['C', 'D']] + df3 = df.ix[:, ['key']] + + result = df1.join([df2, df3]) + assert_frame_equal(result, df) + + def test_append_missing_column_proper_upcast(self): + df1 = DataFrame({'A': np.array([1, 2, 3, 4], dtype='i8')}) + df2 = DataFrame({'B': np.array([True, False, True, False], + dtype=bool)}) + + appended = df1.append(df2, ignore_index=True) + self.assertEqual(appended['A'].dtype, 'f8') + self.assertEqual(appended['B'].dtype, 'O') + + def test_concat_copy(self): + + df = DataFrame(np.random.randn(4, 3)) + df2 = DataFrame(np.random.randint(0, 10, size=4).reshape(4, 1)) + df3 = DataFrame({5: 'foo'}, index=range(4)) + + # these are actual copies + result = concat([df, df2, df3], axis=1, copy=True) + for b in result._data.blocks: + self.assertIsNone(b.values.base) + + # these are the same + result = concat([df, df2, df3], axis=1, copy=False) + for b in result._data.blocks: + if b.is_float: + self.assertTrue( + b.values.base is df._data.blocks[0].values.base) + elif b.is_integer: + self.assertTrue( + b.values.base is df2._data.blocks[0].values.base) + elif b.is_object: + self.assertIsNotNone(b.values.base) + + # float block was consolidated + df4 = DataFrame(np.random.randn(4, 1)) + result = concat([df, df2, df3, df4], axis=1, copy=False) + for b in result._data.blocks: + if b.is_float: + self.assertIsNone(b.values.base) + elif b.is_integer: + self.assertTrue( + b.values.base is df2._data.blocks[0].values.base) + elif b.is_object: + self.assertIsNotNone(b.values.base) + + def test_concat_with_group_keys(self): + df = DataFrame(np.random.randn(4, 3)) + df2 = DataFrame(np.random.randn(4, 4)) + + # axis=0 + df = DataFrame(np.random.randn(3, 4)) + df2 = DataFrame(np.random.randn(4, 4)) + + result = concat([df, df2], keys=[0, 1]) + exp_index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1, 1], + [0, 1, 2, 0, 1, 2, 3]]) + expected = DataFrame(np.r_[df.values, df2.values], + index=exp_index) + tm.assert_frame_equal(result, expected) + + result = concat([df, df], keys=[0, 1]) + exp_index2 = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1], + [0, 1, 2, 0, 1, 2]]) + expected = DataFrame(np.r_[df.values, df.values], + index=exp_index2) + tm.assert_frame_equal(result, expected) + + # axis=1 + df = DataFrame(np.random.randn(4, 3)) + df2 = DataFrame(np.random.randn(4, 4)) + + result = concat([df, df2], keys=[0, 1], axis=1) + expected = DataFrame(np.c_[df.values, df2.values], + columns=exp_index) + tm.assert_frame_equal(result, expected) + + result = concat([df, df], keys=[0, 1], axis=1) + expected = DataFrame(np.c_[df.values, df.values], + columns=exp_index2) + tm.assert_frame_equal(result, expected) + + def test_concat_keys_specific_levels(self): + df = DataFrame(np.random.randn(10, 4)) + pieces = 
[df.ix[:, [0, 1]], df.ix[:, [2]], df.ix[:, [3]]] + level = ['three', 'two', 'one', 'zero'] + result = concat(pieces, axis=1, keys=['one', 'two', 'three'], + levels=[level], + names=['group_key']) + + self.assert_numpy_array_equal(result.columns.levels[0], level) + self.assertEqual(result.columns.names[0], 'group_key') + + def test_concat_dataframe_keys_bug(self): + t1 = DataFrame({ + 'value': Series([1, 2, 3], index=Index(['a', 'b', 'c'], + name='id'))}) + t2 = DataFrame({ + 'value': Series([7, 8], index=Index(['a', 'b'], name='id'))}) + + # it works + result = concat([t1, t2], axis=1, keys=['t1', 't2']) + self.assertEqual(list(result.columns), [('t1', 'value'), + ('t2', 'value')]) + + def test_concat_series_partial_columns_names(self): + # GH10698 + foo = Series([1, 2], name='foo') + bar = Series([1, 2]) + baz = Series([4, 5]) + + result = concat([foo, bar, baz], axis=1) + expected = DataFrame({'foo': [1, 2], 0: [1, 2], 1: [ + 4, 5]}, columns=['foo', 0, 1]) + tm.assert_frame_equal(result, expected) + + result = concat([foo, bar, baz], axis=1, keys=[ + 'red', 'blue', 'yellow']) + expected = DataFrame({'red': [1, 2], 'blue': [1, 2], 'yellow': [ + 4, 5]}, columns=['red', 'blue', 'yellow']) + tm.assert_frame_equal(result, expected) + + result = concat([foo, bar, baz], axis=1, ignore_index=True) + expected = DataFrame({0: [1, 2], 1: [1, 2], 2: [4, 5]}) + tm.assert_frame_equal(result, expected) + + def test_concat_dict(self): + frames = {'foo': DataFrame(np.random.randn(4, 3)), + 'bar': DataFrame(np.random.randn(4, 3)), + 'baz': DataFrame(np.random.randn(4, 3)), + 'qux': DataFrame(np.random.randn(4, 3))} + + sorted_keys = sorted(frames) + + result = concat(frames) + expected = concat([frames[k] for k in sorted_keys], keys=sorted_keys) + tm.assert_frame_equal(result, expected) + + result = concat(frames, axis=1) + expected = concat([frames[k] for k in sorted_keys], keys=sorted_keys, + axis=1) + tm.assert_frame_equal(result, expected) + + keys = ['baz', 'foo', 'bar'] + result = concat(frames, keys=keys) + expected = concat([frames[k] for k in keys], keys=keys) + tm.assert_frame_equal(result, expected) + + def test_concat_ignore_index(self): + frame1 = DataFrame({"test1": ["a", "b", "c"], + "test2": [1, 2, 3], + "test3": [4.5, 3.2, 1.2]}) + frame2 = DataFrame({"test3": [5.2, 2.2, 4.3]}) + frame1.index = Index(["x", "y", "z"]) + frame2.index = Index(["x", "y", "q"]) + + v1 = concat([frame1, frame2], axis=1, ignore_index=True) + + nan = np.nan + expected = DataFrame([[nan, nan, nan, 4.3], + ['a', 1, 4.5, 5.2], + ['b', 2, 3.2, 2.2], + ['c', 3, 1.2, nan]], + index=Index(["q", "x", "y", "z"])) + + tm.assert_frame_equal(v1, expected) + + def test_concat_multiindex_with_keys(self): + index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], + ['one', 'two', 'three']], + labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=['first', 'second']) + frame = DataFrame(np.random.randn(10, 3), index=index, + columns=Index(['A', 'B', 'C'], name='exp')) + result = concat([frame, frame], keys=[0, 1], names=['iteration']) + + self.assertEqual(result.index.names, ('iteration',) + index.names) + tm.assert_frame_equal(result.ix[0], frame) + tm.assert_frame_equal(result.ix[1], frame) + self.assertEqual(result.index.nlevels, 3) + + def test_concat_multiindex_with_tz(self): + # GH 6606 + df = DataFrame({'dt': [datetime(2014, 1, 1), + datetime(2014, 1, 2), + datetime(2014, 1, 3)], + 'b': ['A', 'B', 'C'], + 'c': [1, 2, 3], 'd': [4, 5, 6]}) + df['dt'] = df['dt'].apply(lambda d: Timestamp(d, 
tz='US/Pacific')) + df = df.set_index(['dt', 'b']) + + exp_idx1 = DatetimeIndex(['2014-01-01', '2014-01-02', + '2014-01-03'] * 2, + tz='US/Pacific', name='dt') + exp_idx2 = Index(['A', 'B', 'C'] * 2, name='b') + exp_idx = MultiIndex.from_arrays([exp_idx1, exp_idx2]) + expected = DataFrame({'c': [1, 2, 3] * 2, 'd': [4, 5, 6] * 2}, + index=exp_idx, columns=['c', 'd']) + + result = concat([df, df]) + tm.assert_frame_equal(result, expected) + + def test_concat_keys_and_levels(self): + df = DataFrame(np.random.randn(1, 3)) + df2 = DataFrame(np.random.randn(1, 4)) + + levels = [['foo', 'baz'], ['one', 'two']] + names = ['first', 'second'] + result = concat([df, df2, df, df2], + keys=[('foo', 'one'), ('foo', 'two'), + ('baz', 'one'), ('baz', 'two')], + levels=levels, + names=names) + expected = concat([df, df2, df, df2]) + exp_index = MultiIndex(levels=levels + [[0]], + labels=[[0, 0, 1, 1], [0, 1, 0, 1], + [0, 0, 0, 0]], + names=names + [None]) + expected.index = exp_index + + assert_frame_equal(result, expected) + + # no names + + result = concat([df, df2, df, df2], + keys=[('foo', 'one'), ('foo', 'two'), + ('baz', 'one'), ('baz', 'two')], + levels=levels) + self.assertEqual(result.index.names, (None,) * 3) + + # no levels + result = concat([df, df2, df, df2], + keys=[('foo', 'one'), ('foo', 'two'), + ('baz', 'one'), ('baz', 'two')], + names=['first', 'second']) + self.assertEqual(result.index.names, ('first', 'second') + (None,)) + self.assert_numpy_array_equal(result.index.levels[0], ['baz', 'foo']) + + def test_concat_keys_levels_no_overlap(self): + # GH #1406 + df = DataFrame(np.random.randn(1, 3), index=['a']) + df2 = DataFrame(np.random.randn(1, 4), index=['b']) + + self.assertRaises(ValueError, concat, [df, df], + keys=['one', 'two'], levels=[['foo', 'bar', 'baz']]) + + self.assertRaises(ValueError, concat, [df, df2], + keys=['one', 'two'], levels=[['foo', 'bar', 'baz']]) + + def test_concat_rename_index(self): + a = DataFrame(np.random.rand(3, 3), + columns=list('ABC'), + index=Index(list('abc'), name='index_a')) + b = DataFrame(np.random.rand(3, 3), + columns=list('ABC'), + index=Index(list('abc'), name='index_b')) + + result = concat([a, b], keys=['key0', 'key1'], + names=['lvl0', 'lvl1']) + + exp = concat([a, b], keys=['key0', 'key1'], names=['lvl0']) + names = list(exp.index.names) + names[1] = 'lvl1' + exp.index.set_names(names, inplace=True) + + tm.assert_frame_equal(result, exp) + self.assertEqual(result.index.names, exp.index.names) + + def test_crossed_dtypes_weird_corner(self): + columns = ['A', 'B', 'C', 'D'] + df1 = DataFrame({'A': np.array([1, 2, 3, 4], dtype='f8'), + 'B': np.array([1, 2, 3, 4], dtype='i8'), + 'C': np.array([1, 2, 3, 4], dtype='f8'), + 'D': np.array([1, 2, 3, 4], dtype='i8')}, + columns=columns) + + df2 = DataFrame({'A': np.array([1, 2, 3, 4], dtype='i8'), + 'B': np.array([1, 2, 3, 4], dtype='f8'), + 'C': np.array([1, 2, 3, 4], dtype='i8'), + 'D': np.array([1, 2, 3, 4], dtype='f8')}, + columns=columns) + + appended = df1.append(df2, ignore_index=True) + expected = DataFrame(np.concatenate([df1.values, df2.values], axis=0), + columns=columns) + tm.assert_frame_equal(appended, expected) + + df = DataFrame(np.random.randn(1, 3), index=['a']) + df2 = DataFrame(np.random.randn(1, 4), index=['b']) + result = concat( + [df, df2], keys=['one', 'two'], names=['first', 'second']) + self.assertEqual(result.index.names, ('first', 'second')) + + def test_dups_index(self): + # GH 4771 + + # single dtypes + df = DataFrame(np.random.randint(0, 10, size=40).reshape( + 10, 
4), columns=['A', 'A', 'C', 'C']) + + result = concat([df, df], axis=1) + assert_frame_equal(result.iloc[:, :4], df) + assert_frame_equal(result.iloc[:, 4:], df) + + result = concat([df, df], axis=0) + assert_frame_equal(result.iloc[:10], df) + assert_frame_equal(result.iloc[10:], df) + + # multi dtypes + df = concat([DataFrame(np.random.randn(10, 4), + columns=['A', 'A', 'B', 'B']), + DataFrame(np.random.randint(0, 10, size=20) + .reshape(10, 2), + columns=['A', 'C'])], + axis=1) + + result = concat([df, df], axis=1) + assert_frame_equal(result.iloc[:, :6], df) + assert_frame_equal(result.iloc[:, 6:], df) + + result = concat([df, df], axis=0) + assert_frame_equal(result.iloc[:10], df) + assert_frame_equal(result.iloc[10:], df) + + # append + result = df.iloc[0:8, :].append(df.iloc[8:]) + assert_frame_equal(result, df) + + result = df.iloc[0:8, :].append(df.iloc[8:9]).append(df.iloc[9:10]) + assert_frame_equal(result, df) + + expected = concat([df, df], axis=0) + result = df.append(df) + assert_frame_equal(result, expected) + + def test_with_mixed_tuples(self): + # 10697 + # columns have mixed tuples, so handle properly + df1 = DataFrame({u'A': 'foo', (u'B', 1): 'bar'}, index=range(2)) + df2 = DataFrame({u'B': 'foo', (u'B', 1): 'bar'}, index=range(2)) + + # it works + concat([df1, df2]) + + def test_join_dups(self): + + # joining dups + df = concat([DataFrame(np.random.randn(10, 4), + columns=['A', 'A', 'B', 'B']), + DataFrame(np.random.randint(0, 10, size=20) + .reshape(10, 2), + columns=['A', 'C'])], + axis=1) + + expected = concat([df, df], axis=1) + result = df.join(df, rsuffix='_2') + result.columns = expected.columns + assert_frame_equal(result, expected) + + # GH 4975, invalid join on dups + w = DataFrame(np.random.randn(4, 2), columns=["x", "y"]) + x = DataFrame(np.random.randn(4, 2), columns=["x", "y"]) + y = DataFrame(np.random.randn(4, 2), columns=["x", "y"]) + z = DataFrame(np.random.randn(4, 2), columns=["x", "y"]) + + dta = x.merge(y, left_index=True, right_index=True).merge( + z, left_index=True, right_index=True, how="outer") + dta = dta.merge(w, left_index=True, right_index=True) + expected = concat([x, y, z, w], axis=1) + expected.columns = ['x_x', 'y_x', 'x_y', + 'y_y', 'x_x', 'y_x', 'x_y', 'y_y'] + assert_frame_equal(dta, expected) + + def test_handle_empty_objects(self): + df = DataFrame(np.random.randn(10, 4), columns=list('abcd')) + + baz = df[:5].copy() + baz['foo'] = 'bar' + empty = df[5:5] + + frames = [baz, empty, empty, df[5:]] + concatted = concat(frames, axis=0) + + expected = df.ix[:, ['a', 'b', 'c', 'd', 'foo']] + expected['foo'] = expected['foo'].astype('O') + expected.loc[0:4, 'foo'] = 'bar' + + tm.assert_frame_equal(concatted, expected) + + # empty as first element with time series + # GH3259 + df = DataFrame(dict(A=range(10000)), index=date_range( + '20130101', periods=10000, freq='s')) + empty = DataFrame() + result = concat([df, empty], axis=1) + assert_frame_equal(result, df) + result = concat([empty, df], axis=1) + assert_frame_equal(result, df) + + result = concat([df, empty]) + assert_frame_equal(result, df) + result = concat([empty, df]) + assert_frame_equal(result, df) + + def test_concat_mixed_objs(self): + + # concat mixed series/frames + # G2385 + + # axis 1 + index = date_range('01-Jan-2013', periods=10, freq='H') + arr = np.arange(10, dtype='int64') + s1 = Series(arr, index=index) + s2 = Series(arr, index=index) + df = DataFrame(arr.reshape(-1, 1), index=index) + + expected = DataFrame(np.repeat(arr, 2).reshape(-1, 2), + index=index, 
columns=[0, 0]) + result = concat([df, df], axis=1) + assert_frame_equal(result, expected) + + expected = DataFrame(np.repeat(arr, 2).reshape(-1, 2), + index=index, columns=[0, 1]) + result = concat([s1, s2], axis=1) + assert_frame_equal(result, expected) + + expected = DataFrame(np.repeat(arr, 3).reshape(-1, 3), + index=index, columns=[0, 1, 2]) + result = concat([s1, s2, s1], axis=1) + assert_frame_equal(result, expected) + + expected = DataFrame(np.repeat(arr, 5).reshape(-1, 5), + index=index, columns=[0, 0, 1, 2, 3]) + result = concat([s1, df, s2, s2, s1], axis=1) + assert_frame_equal(result, expected) + + # with names + s1.name = 'foo' + expected = DataFrame(np.repeat(arr, 3).reshape(-1, 3), + index=index, columns=['foo', 0, 0]) + result = concat([s1, df, s2], axis=1) + assert_frame_equal(result, expected) + + s2.name = 'bar' + expected = DataFrame(np.repeat(arr, 3).reshape(-1, 3), + index=index, columns=['foo', 0, 'bar']) + result = concat([s1, df, s2], axis=1) + assert_frame_equal(result, expected) + + # ignore index + expected = DataFrame(np.repeat(arr, 3).reshape(-1, 3), + index=index, columns=[0, 1, 2]) + result = concat([s1, df, s2], axis=1, ignore_index=True) + assert_frame_equal(result, expected) + + # axis 0 + expected = DataFrame(np.tile(arr, 3).reshape(-1, 1), + index=index.tolist() * 3, columns=[0]) + result = concat([s1, df, s2]) + assert_frame_equal(result, expected) + + expected = DataFrame(np.tile(arr, 3).reshape(-1, 1), columns=[0]) + result = concat([s1, df, s2], ignore_index=True) + assert_frame_equal(result, expected) + + # invalid concatente of mixed dims + panel = tm.makePanel() + self.assertRaises(ValueError, lambda: concat([panel, s1], axis=1)) + + def test_panel_join(self): + panel = tm.makePanel() + tm.add_nans(panel) + + p1 = panel.ix[:2, :10, :3] + p2 = panel.ix[2:, 5:, 2:] + + # left join + result = p1.join(p2) + expected = p1.copy() + expected['ItemC'] = p2['ItemC'] + tm.assert_panel_equal(result, expected) + + # right join + result = p1.join(p2, how='right') + expected = p2.copy() + expected['ItemA'] = p1['ItemA'] + expected['ItemB'] = p1['ItemB'] + expected = expected.reindex(items=['ItemA', 'ItemB', 'ItemC']) + tm.assert_panel_equal(result, expected) + + # inner join + result = p1.join(p2, how='inner') + expected = panel.ix[:, 5:10, 2:3] + tm.assert_panel_equal(result, expected) + + # outer join + result = p1.join(p2, how='outer') + expected = p1.reindex(major=panel.major_axis, + minor=panel.minor_axis) + expected = expected.join(p2.reindex(major=panel.major_axis, + minor=panel.minor_axis)) + tm.assert_panel_equal(result, expected) + + def test_panel_join_overlap(self): + panel = tm.makePanel() + tm.add_nans(panel) + + p1 = panel.ix[['ItemA', 'ItemB', 'ItemC']] + p2 = panel.ix[['ItemB', 'ItemC']] + + # Expected index is + # + # ItemA, ItemB_p1, ItemC_p1, ItemB_p2, ItemC_p2 + joined = p1.join(p2, lsuffix='_p1', rsuffix='_p2') + p1_suf = p1.ix[['ItemB', 'ItemC']].add_suffix('_p1') + p2_suf = p2.ix[['ItemB', 'ItemC']].add_suffix('_p2') + no_overlap = panel.ix[['ItemA']] + expected = no_overlap.join(p1_suf.join(p2_suf)) + tm.assert_panel_equal(joined, expected) + + def test_panel_join_many(self): + tm.K = 10 + panel = tm.makePanel() + tm.K = 4 + + panels = [panel.ix[:2], panel.ix[2:6], panel.ix[6:]] + + joined = panels[0].join(panels[1:]) + tm.assert_panel_equal(joined, panel) + + panels = [panel.ix[:2, :-5], panel.ix[2:6, 2:], panel.ix[6:, 5:-7]] + + data_dict = {} + for p in panels: + data_dict.update(p.iteritems()) + + joined = 
panels[0].join(panels[1:], how='inner') + expected = Panel.from_dict(data_dict, intersect=True) + tm.assert_panel_equal(joined, expected) + + joined = panels[0].join(panels[1:], how='outer') + expected = Panel.from_dict(data_dict, intersect=False) + tm.assert_panel_equal(joined, expected) + + # edge cases + self.assertRaises(ValueError, panels[0].join, panels[1:], + how='outer', lsuffix='foo', rsuffix='bar') + self.assertRaises(ValueError, panels[0].join, panels[1:], + how='right') + + def test_panel_concat_other_axes(self): + panel = tm.makePanel() + + p1 = panel.ix[:, :5, :] + p2 = panel.ix[:, 5:, :] + + result = concat([p1, p2], axis=1) + tm.assert_panel_equal(result, panel) + + p1 = panel.ix[:, :, :2] + p2 = panel.ix[:, :, 2:] + + result = concat([p1, p2], axis=2) + tm.assert_panel_equal(result, panel) + + # if things are a bit misbehaved + p1 = panel.ix[:2, :, :2] + p2 = panel.ix[:, :, 2:] + p1['ItemC'] = 'baz' + + result = concat([p1, p2], axis=2) + + expected = panel.copy() + expected['ItemC'] = expected['ItemC'].astype('O') + expected.ix['ItemC', :, :2] = 'baz' + tm.assert_panel_equal(result, expected) + + def test_panel_concat_buglet(self): + # #2257 + def make_panel(): + index = 5 + cols = 3 + + def df(): + return DataFrame(np.random.randn(index, cols), + index=["I%s" % i for i in range(index)], + columns=["C%s" % i for i in range(cols)]) + return Panel(dict([("Item%s" % x, df()) for x in ['A', 'B', 'C']])) + + panel1 = make_panel() + panel2 = make_panel() + + panel2 = panel2.rename_axis(dict([(x, "%s_1" % x) + for x in panel2.major_axis]), + axis=1) + + panel3 = panel2.rename_axis(lambda x: '%s_1' % x, axis=1) + panel3 = panel3.rename_axis(lambda x: '%s_1' % x, axis=2) + + # it works! + concat([panel1, panel3], axis=1, verify_integrity=True) + + def test_panel4d_concat(self): + p4d = tm.makePanel4D() + + p1 = p4d.ix[:, :, :5, :] + p2 = p4d.ix[:, :, 5:, :] + + result = concat([p1, p2], axis=2) + tm.assert_panel4d_equal(result, p4d) + + p1 = p4d.ix[:, :, :, :2] + p2 = p4d.ix[:, :, :, 2:] + + result = concat([p1, p2], axis=3) + tm.assert_panel4d_equal(result, p4d) + + def test_panel4d_concat_mixed_type(self): + p4d = tm.makePanel4D() + + # if things are a bit misbehaved + p1 = p4d.ix[:, :2, :, :2] + p2 = p4d.ix[:, :, :, 2:] + p1['L5'] = 'baz' + + result = concat([p1, p2], axis=3) + + p2['L5'] = np.nan + expected = concat([p1, p2], axis=3) + expected = expected.ix[result.labels] + + tm.assert_panel4d_equal(result, expected) + + def test_concat_series(self): + + ts = tm.makeTimeSeries() + ts.name = 'foo' + + pieces = [ts[:5], ts[5:15], ts[15:]] + + result = concat(pieces) + tm.assert_series_equal(result, ts) + self.assertEqual(result.name, ts.name) + + result = concat(pieces, keys=[0, 1, 2]) + expected = ts.copy() + + ts.index = DatetimeIndex(np.array(ts.index.values, dtype='M8[ns]')) + + exp_labels = [np.repeat([0, 1, 2], [len(x) for x in pieces]), + np.arange(len(ts))] + exp_index = MultiIndex(levels=[[0, 1, 2], ts.index], + labels=exp_labels) + expected.index = exp_index + tm.assert_series_equal(result, expected) + + def test_concat_series_axis1(self): + ts = tm.makeTimeSeries() + + pieces = [ts[:-2], ts[2:], ts[2:-2]] + + result = concat(pieces, axis=1) + expected = DataFrame(pieces).T + assert_frame_equal(result, expected) + + result = concat(pieces, keys=['A', 'B', 'C'], axis=1) + expected = DataFrame(pieces, index=['A', 'B', 'C']).T + assert_frame_equal(result, expected) + + # preserve series names, #2489 + s = Series(randn(5), name='A') + s2 = Series(randn(5), name='B') + + 
result = concat([s, s2], axis=1) + expected = DataFrame({'A': s, 'B': s2}) + assert_frame_equal(result, expected) + + s2.name = None + result = concat([s, s2], axis=1) + self.assertTrue(np.array_equal( + result.columns, Index(['A', 0], dtype='object'))) + + # must reindex, #2603 + s = Series(randn(3), index=['c', 'a', 'b'], name='A') + s2 = Series(randn(4), index=['d', 'a', 'b', 'c'], name='B') + result = concat([s, s2], axis=1) + expected = DataFrame({'A': s, 'B': s2}) + assert_frame_equal(result, expected) + + def test_concat_single_with_key(self): + df = DataFrame(np.random.randn(10, 4)) + + result = concat([df], keys=['foo']) + expected = concat([df, df], keys=['foo', 'bar']) + tm.assert_frame_equal(result, expected[:10]) + + def test_concat_exclude_none(self): + df = DataFrame(np.random.randn(10, 4)) + + pieces = [df[:5], None, None, df[5:]] + result = concat(pieces) + tm.assert_frame_equal(result, df) + self.assertRaises(ValueError, concat, [None, None]) + + def test_concat_datetime64_block(self): + from pandas.tseries.index import date_range + + rng = date_range('1/1/2000', periods=10) + + df = DataFrame({'time': rng}) + + result = concat([df, df]) + self.assertTrue((result.iloc[:10]['time'] == rng).all()) + self.assertTrue((result.iloc[10:]['time'] == rng).all()) + + def test_concat_timedelta64_block(self): + from pandas import to_timedelta + + rng = to_timedelta(np.arange(10), unit='s') + + df = DataFrame({'time': rng}) + + result = concat([df, df]) + self.assertTrue((result.iloc[:10]['time'] == rng).all()) + self.assertTrue((result.iloc[10:]['time'] == rng).all()) + + def test_concat_keys_with_none(self): + # #1649 + df0 = DataFrame([[10, 20, 30], [10, 20, 30], [10, 20, 30]]) + + result = concat(dict(a=None, b=df0, c=df0[:2], d=df0[:1], e=df0)) + expected = concat(dict(b=df0, c=df0[:2], d=df0[:1], e=df0)) + tm.assert_frame_equal(result, expected) + + result = concat([None, df0, df0[:2], df0[:1], df0], + keys=['a', 'b', 'c', 'd', 'e']) + expected = concat([df0, df0[:2], df0[:1], df0], + keys=['b', 'c', 'd', 'e']) + tm.assert_frame_equal(result, expected) + + def test_concat_bug_1719(self): + ts1 = tm.makeTimeSeries() + ts2 = tm.makeTimeSeries()[::2] + + # to join with union + # these two are of different length! 
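# --- aside: what the outer join below does, as a minimal standalone sketch
# --- (values and names are illustrative; `pd` is assumed to be pandas)

import pandas as pd

sa = pd.Series([1.0, 2.0], index=[0, 1])
sb = pd.Series([3.0], index=[2])

# join='outer' unions the two indexes and fills the holes with NaN, so the
# result has the same length regardless of the order of the operands
assert len(pd.concat([sa, sb], join='outer', axis=1)) == 3
assert len(pd.concat([sb, sa], join='outer', axis=1)) == 3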
+ left = concat([ts1, ts2], join='outer', axis=1) + right = concat([ts2, ts1], join='outer', axis=1) + + self.assertEqual(len(left), len(right)) + + def test_concat_bug_2972(self): + ts0 = Series(np.zeros(5)) + ts1 = Series(np.ones(5)) + ts0.name = ts1.name = 'same name' + result = concat([ts0, ts1], axis=1) + + expected = DataFrame({0: ts0, 1: ts1}) + expected.columns = ['same name', 'same name'] + assert_frame_equal(result, expected) + + def test_concat_bug_3602(self): + + # GH 3602, duplicate columns + df1 = DataFrame({'firmNo': [0, 0, 0, 0], 'stringvar': [ + 'rrr', 'rrr', 'rrr', 'rrr'], 'prc': [6, 6, 6, 6]}) + df2 = DataFrame({'misc': [1, 2, 3, 4], 'prc': [ + 6, 6, 6, 6], 'C': [9, 10, 11, 12]}) + expected = DataFrame([[0, 6, 'rrr', 9, 1, 6], + [0, 6, 'rrr', 10, 2, 6], + [0, 6, 'rrr', 11, 3, 6], + [0, 6, 'rrr', 12, 4, 6]]) + expected.columns = ['firmNo', 'prc', 'stringvar', 'C', 'misc', 'prc'] + + result = concat([df1, df2], axis=1) + assert_frame_equal(result, expected) + + def test_concat_series_axis1_same_names_ignore_index(self): + dates = date_range('01-Jan-2013', '01-Jan-2014', freq='MS')[0:-1] + s1 = Series(randn(len(dates)), index=dates, name='value') + s2 = Series(randn(len(dates)), index=dates, name='value') + + result = concat([s1, s2], axis=1, ignore_index=True) + self.assertTrue(np.array_equal(result.columns, [0, 1])) + + def test_concat_iterables(self): + from collections import deque, Iterable + + # GH8645 check concat works with tuples, list, generators, and weird + # stuff like deque and custom iterables + df1 = DataFrame([1, 2, 3]) + df2 = DataFrame([4, 5, 6]) + expected = DataFrame([1, 2, 3, 4, 5, 6]) + assert_frame_equal(concat((df1, df2), ignore_index=True), expected) + assert_frame_equal(concat([df1, df2], ignore_index=True), expected) + assert_frame_equal(concat((df for df in (df1, df2)), + ignore_index=True), expected) + assert_frame_equal( + concat(deque((df1, df2)), ignore_index=True), expected) + + class CustomIterator1(object): + + def __len__(self): + return 2 + + def __getitem__(self, index): + try: + return {0: df1, 1: df2}[index] + except KeyError: + raise IndexError + assert_frame_equal(pd.concat(CustomIterator1(), + ignore_index=True), expected) + + class CustomIterator2(Iterable): + + def __iter__(self): + yield df1 + yield df2 + assert_frame_equal(pd.concat(CustomIterator2(), + ignore_index=True), expected) + + def test_concat_invalid(self): + + # trying to concat a ndframe with a non-ndframe + df1 = mkdf(10, 2) + for obj in [1, dict(), [1, 2], (1, 2)]: + self.assertRaises(TypeError, lambda x: concat([df1, obj])) + + def test_concat_invalid_first_argument(self): + df1 = mkdf(10, 2) + df2 = mkdf(10, 2) + self.assertRaises(TypeError, concat, df1, df2) + + # generator ok though + concat(DataFrame(np.random.rand(5, 5)) for _ in range(3)) + + # text reader ok + # GH6583 + data = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo2,12,13,14,15 +bar2,12,13,14,15 +""" + + reader = read_csv(StringIO(data), chunksize=1) + result = concat(reader, ignore_index=True) + expected = read_csv(StringIO(data)) + assert_frame_equal(result, expected) + + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tools/tests/test_merge.py b/pandas/tools/tests/test_merge.py index 474ce0f899217..2f3a8f77af09b 100644 --- a/pandas/tools/tests/test_merge.py +++ b/pandas/tools/tests/test_merge.py @@ -9,17 +9,14 @@ import random import pandas as pd -from pandas.compat 
import range, lrange, lzip, StringIO -from pandas import compat -from pandas.tseries.index import DatetimeIndex -from pandas.tools.merge import merge, concat, ordered_merge, MergeError -from pandas import Categorical, Timestamp -from pandas.util.testing import (assert_frame_equal, assert_series_equal, - assert_almost_equal, - makeCustomDataframe as mkdf, - assertRaisesRegexp, slow) -from pandas import (isnull, DataFrame, Index, MultiIndex, Panel, - Series, date_range, read_csv) +from pandas.compat import range, lrange, lzip +from pandas.tools.merge import merge, concat, MergeError +from pandas.util.testing import (assert_frame_equal, + assert_series_equal, + slow) +from pandas import (DataFrame, Index, MultiIndex, + Series, date_range, Categorical, + compat) import pandas.algos as algos import pandas.util.testing as tm @@ -2159,1100 +2156,6 @@ def _join_by_hand(a, b, how='left'): return a_re.reindex(columns=result_columns) -class TestConcatenate(tm.TestCase): - - _multiprocess_can_split_ = True - - def setUp(self): - self.frame = DataFrame(tm.getSeriesData()) - self.mixed_frame = self.frame.copy() - self.mixed_frame['foo'] = 'bar' - - def test_append(self): - begin_index = self.frame.index[:5] - end_index = self.frame.index[5:] - - begin_frame = self.frame.reindex(begin_index) - end_frame = self.frame.reindex(end_index) - - appended = begin_frame.append(end_frame) - assert_almost_equal(appended['A'], self.frame['A']) - - del end_frame['A'] - partial_appended = begin_frame.append(end_frame) - self.assertIn('A', partial_appended) - - partial_appended = end_frame.append(begin_frame) - self.assertIn('A', partial_appended) - - # mixed type handling - appended = self.mixed_frame[:5].append(self.mixed_frame[5:]) - assert_frame_equal(appended, self.mixed_frame) - - # what to test here - mixed_appended = self.mixed_frame[:5].append(self.frame[5:]) - mixed_appended2 = self.frame[:5].append(self.mixed_frame[5:]) - - # all equal except 'foo' column - assert_frame_equal( - mixed_appended.reindex(columns=['A', 'B', 'C', 'D']), - mixed_appended2.reindex(columns=['A', 'B', 'C', 'D'])) - - # append empty - empty = DataFrame({}) - - appended = self.frame.append(empty) - assert_frame_equal(self.frame, appended) - self.assertIsNot(appended, self.frame) - - appended = empty.append(self.frame) - assert_frame_equal(self.frame, appended) - self.assertIsNot(appended, self.frame) - - # overlap - self.assertRaises(ValueError, self.frame.append, self.frame, - verify_integrity=True) - - # new columns - # GH 6129 - df = DataFrame({'a': {'x': 1, 'y': 2}, 'b': {'x': 3, 'y': 4}}) - row = Series([5, 6, 7], index=['a', 'b', 'c'], name='z') - expected = DataFrame({'a': {'x': 1, 'y': 2, 'z': 5}, 'b': { - 'x': 3, 'y': 4, 'z': 6}, 'c': {'z': 7}}) - result = df.append(row) - assert_frame_equal(result, expected) - - def test_append_length0_frame(self): - df = DataFrame(columns=['A', 'B', 'C']) - df3 = DataFrame(index=[0, 1], columns=['A', 'B']) - df5 = df.append(df3) - - expected = DataFrame(index=[0, 1], columns=['A', 'B', 'C']) - assert_frame_equal(df5, expected) - - def test_append_records(self): - arr1 = np.zeros((2,), dtype=('i4,f4,a10')) - arr1[:] = [(1, 2., 'Hello'), (2, 3., "World")] - - arr2 = np.zeros((3,), dtype=('i4,f4,a10')) - arr2[:] = [(3, 4., 'foo'), - (5, 6., "bar"), - (7., 8., 'baz')] - - df1 = DataFrame(arr1) - df2 = DataFrame(arr2) - - result = df1.append(df2, ignore_index=True) - expected = DataFrame(np.concatenate((arr1, arr2))) - assert_frame_equal(result, expected) - - def 
test_append_different_columns(self): - df = DataFrame({'bools': np.random.randn(10) > 0, - 'ints': np.random.randint(0, 10, 10), - 'floats': np.random.randn(10), - 'strings': ['foo', 'bar'] * 5}) - - a = df[:5].ix[:, ['bools', 'ints', 'floats']] - b = df[5:].ix[:, ['strings', 'ints', 'floats']] - - appended = a.append(b) - self.assertTrue(isnull(appended['strings'][0:4]).all()) - self.assertTrue(isnull(appended['bools'][5:]).all()) - - def test_append_many(self): - chunks = [self.frame[:5], self.frame[5:10], - self.frame[10:15], self.frame[15:]] - - result = chunks[0].append(chunks[1:]) - tm.assert_frame_equal(result, self.frame) - - chunks[-1] = chunks[-1].copy() - chunks[-1]['foo'] = 'bar' - result = chunks[0].append(chunks[1:]) - tm.assert_frame_equal(result.ix[:, self.frame.columns], self.frame) - self.assertTrue((result['foo'][15:] == 'bar').all()) - self.assertTrue(result['foo'][:15].isnull().all()) - - def test_append_preserve_index_name(self): - # #980 - df1 = DataFrame(data=None, columns=['A', 'B', 'C']) - df1 = df1.set_index(['A']) - df2 = DataFrame(data=[[1, 4, 7], [2, 5, 8], [3, 6, 9]], - columns=['A', 'B', 'C']) - df2 = df2.set_index(['A']) - - result = df1.append(df2) - self.assertEqual(result.index.name, 'A') - - def test_join_many(self): - df = DataFrame(np.random.randn(10, 6), columns=list('abcdef')) - df_list = [df[['a', 'b']], df[['c', 'd']], df[['e', 'f']]] - - joined = df_list[0].join(df_list[1:]) - tm.assert_frame_equal(joined, df) - - df_list = [df[['a', 'b']][:-2], - df[['c', 'd']][2:], df[['e', 'f']][1:9]] - - def _check_diff_index(df_list, result, exp_index): - reindexed = [x.reindex(exp_index) for x in df_list] - expected = reindexed[0].join(reindexed[1:]) - tm.assert_frame_equal(result, expected) - - # different join types - joined = df_list[0].join(df_list[1:], how='outer') - _check_diff_index(df_list, joined, df.index) - - joined = df_list[0].join(df_list[1:]) - _check_diff_index(df_list, joined, df_list[0].index) - - joined = df_list[0].join(df_list[1:], how='inner') - _check_diff_index(df_list, joined, df.index[2:8]) - - self.assertRaises(ValueError, df_list[0].join, df_list[1:], on='a') - - def test_join_many_mixed(self): - df = DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D']) - df['key'] = ['foo', 'bar'] * 4 - df1 = df.ix[:, ['A', 'B']] - df2 = df.ix[:, ['C', 'D']] - df3 = df.ix[:, ['key']] - - result = df1.join([df2, df3]) - assert_frame_equal(result, df) - - def test_append_missing_column_proper_upcast(self): - df1 = DataFrame({'A': np.array([1, 2, 3, 4], dtype='i8')}) - df2 = DataFrame({'B': np.array([True, False, True, False], - dtype=bool)}) - - appended = df1.append(df2, ignore_index=True) - self.assertEqual(appended['A'].dtype, 'f8') - self.assertEqual(appended['B'].dtype, 'O') - - def test_concat_copy(self): - - df = DataFrame(np.random.randn(4, 3)) - df2 = DataFrame(np.random.randint(0, 10, size=4).reshape(4, 1)) - df3 = DataFrame({5: 'foo'}, index=range(4)) - - # these are actual copies - result = concat([df, df2, df3], axis=1, copy=True) - for b in result._data.blocks: - self.assertIsNone(b.values.base) - - # these are the same - result = concat([df, df2, df3], axis=1, copy=False) - for b in result._data.blocks: - if b.is_float: - self.assertTrue( - b.values.base is df._data.blocks[0].values.base) - elif b.is_integer: - self.assertTrue( - b.values.base is df2._data.blocks[0].values.base) - elif b.is_object: - self.assertIsNotNone(b.values.base) - - # float block was consolidated - df4 = DataFrame(np.random.randn(4, 1)) - result = 
concat([df, df2, df3, df4], axis=1, copy=False) - for b in result._data.blocks: - if b.is_float: - self.assertIsNone(b.values.base) - elif b.is_integer: - self.assertTrue( - b.values.base is df2._data.blocks[0].values.base) - elif b.is_object: - self.assertIsNotNone(b.values.base) - - def test_concat_with_group_keys(self): - df = DataFrame(np.random.randn(4, 3)) - df2 = DataFrame(np.random.randn(4, 4)) - - # axis=0 - df = DataFrame(np.random.randn(3, 4)) - df2 = DataFrame(np.random.randn(4, 4)) - - result = concat([df, df2], keys=[0, 1]) - exp_index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1, 1], - [0, 1, 2, 0, 1, 2, 3]]) - expected = DataFrame(np.r_[df.values, df2.values], - index=exp_index) - tm.assert_frame_equal(result, expected) - - result = concat([df, df], keys=[0, 1]) - exp_index2 = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1], - [0, 1, 2, 0, 1, 2]]) - expected = DataFrame(np.r_[df.values, df.values], - index=exp_index2) - tm.assert_frame_equal(result, expected) - - # axis=1 - df = DataFrame(np.random.randn(4, 3)) - df2 = DataFrame(np.random.randn(4, 4)) - - result = concat([df, df2], keys=[0, 1], axis=1) - expected = DataFrame(np.c_[df.values, df2.values], - columns=exp_index) - tm.assert_frame_equal(result, expected) - - result = concat([df, df], keys=[0, 1], axis=1) - expected = DataFrame(np.c_[df.values, df.values], - columns=exp_index2) - tm.assert_frame_equal(result, expected) - - def test_concat_keys_specific_levels(self): - df = DataFrame(np.random.randn(10, 4)) - pieces = [df.ix[:, [0, 1]], df.ix[:, [2]], df.ix[:, [3]]] - level = ['three', 'two', 'one', 'zero'] - result = concat(pieces, axis=1, keys=['one', 'two', 'three'], - levels=[level], - names=['group_key']) - - self.assert_numpy_array_equal(result.columns.levels[0], level) - self.assertEqual(result.columns.names[0], 'group_key') - - def test_concat_dataframe_keys_bug(self): - t1 = DataFrame({ - 'value': Series([1, 2, 3], index=Index(['a', 'b', 'c'], - name='id'))}) - t2 = DataFrame({ - 'value': Series([7, 8], index=Index(['a', 'b'], name='id'))}) - - # it works - result = concat([t1, t2], axis=1, keys=['t1', 't2']) - self.assertEqual(list(result.columns), [('t1', 'value'), - ('t2', 'value')]) - - def test_concat_series_partial_columns_names(self): - # GH10698 - foo = Series([1, 2], name='foo') - bar = Series([1, 2]) - baz = Series([4, 5]) - - result = concat([foo, bar, baz], axis=1) - expected = DataFrame({'foo': [1, 2], 0: [1, 2], 1: [ - 4, 5]}, columns=['foo', 0, 1]) - tm.assert_frame_equal(result, expected) - - result = concat([foo, bar, baz], axis=1, keys=[ - 'red', 'blue', 'yellow']) - expected = DataFrame({'red': [1, 2], 'blue': [1, 2], 'yellow': [ - 4, 5]}, columns=['red', 'blue', 'yellow']) - tm.assert_frame_equal(result, expected) - - result = concat([foo, bar, baz], axis=1, ignore_index=True) - expected = DataFrame({0: [1, 2], 1: [1, 2], 2: [4, 5]}) - tm.assert_frame_equal(result, expected) - - def test_concat_dict(self): - frames = {'foo': DataFrame(np.random.randn(4, 3)), - 'bar': DataFrame(np.random.randn(4, 3)), - 'baz': DataFrame(np.random.randn(4, 3)), - 'qux': DataFrame(np.random.randn(4, 3))} - - sorted_keys = sorted(frames) - - result = concat(frames) - expected = concat([frames[k] for k in sorted_keys], keys=sorted_keys) - tm.assert_frame_equal(result, expected) - - result = concat(frames, axis=1) - expected = concat([frames[k] for k in sorted_keys], keys=sorted_keys, - axis=1) - tm.assert_frame_equal(result, expected) - - keys = ['baz', 'foo', 'bar'] - result = concat(frames, keys=keys) - 
expected = concat([frames[k] for k in keys], keys=keys) - tm.assert_frame_equal(result, expected) - - def test_concat_ignore_index(self): - frame1 = DataFrame({"test1": ["a", "b", "c"], - "test2": [1, 2, 3], - "test3": [4.5, 3.2, 1.2]}) - frame2 = DataFrame({"test3": [5.2, 2.2, 4.3]}) - frame1.index = Index(["x", "y", "z"]) - frame2.index = Index(["x", "y", "q"]) - - v1 = concat([frame1, frame2], axis=1, ignore_index=True) - - nan = np.nan - expected = DataFrame([[nan, nan, nan, 4.3], - ['a', 1, 4.5, 5.2], - ['b', 2, 3.2, 2.2], - ['c', 3, 1.2, nan]], - index=Index(["q", "x", "y", "z"])) - - tm.assert_frame_equal(v1, expected) - - def test_concat_multiindex_with_keys(self): - index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], - ['one', 'two', 'three']], - labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], - names=['first', 'second']) - frame = DataFrame(np.random.randn(10, 3), index=index, - columns=Index(['A', 'B', 'C'], name='exp')) - result = concat([frame, frame], keys=[0, 1], names=['iteration']) - - self.assertEqual(result.index.names, ('iteration',) + index.names) - tm.assert_frame_equal(result.ix[0], frame) - tm.assert_frame_equal(result.ix[1], frame) - self.assertEqual(result.index.nlevels, 3) - - def test_concat_multiindex_with_tz(self): - # GH 6606 - df = DataFrame({'dt': [datetime(2014, 1, 1), - datetime(2014, 1, 2), - datetime(2014, 1, 3)], - 'b': ['A', 'B', 'C'], - 'c': [1, 2, 3], 'd': [4, 5, 6]}) - df['dt'] = df['dt'].apply(lambda d: Timestamp(d, tz='US/Pacific')) - df = df.set_index(['dt', 'b']) - - exp_idx1 = DatetimeIndex(['2014-01-01', '2014-01-02', - '2014-01-03'] * 2, - tz='US/Pacific', name='dt') - exp_idx2 = Index(['A', 'B', 'C'] * 2, name='b') - exp_idx = MultiIndex.from_arrays([exp_idx1, exp_idx2]) - expected = DataFrame({'c': [1, 2, 3] * 2, 'd': [4, 5, 6] * 2}, - index=exp_idx, columns=['c', 'd']) - - result = concat([df, df]) - tm.assert_frame_equal(result, expected) - - def test_concat_keys_and_levels(self): - df = DataFrame(np.random.randn(1, 3)) - df2 = DataFrame(np.random.randn(1, 4)) - - levels = [['foo', 'baz'], ['one', 'two']] - names = ['first', 'second'] - result = concat([df, df2, df, df2], - keys=[('foo', 'one'), ('foo', 'two'), - ('baz', 'one'), ('baz', 'two')], - levels=levels, - names=names) - expected = concat([df, df2, df, df2]) - exp_index = MultiIndex(levels=levels + [[0]], - labels=[[0, 0, 1, 1], [0, 1, 0, 1], - [0, 0, 0, 0]], - names=names + [None]) - expected.index = exp_index - - assert_frame_equal(result, expected) - - # no names - - result = concat([df, df2, df, df2], - keys=[('foo', 'one'), ('foo', 'two'), - ('baz', 'one'), ('baz', 'two')], - levels=levels) - self.assertEqual(result.index.names, (None,) * 3) - - # no levels - result = concat([df, df2, df, df2], - keys=[('foo', 'one'), ('foo', 'two'), - ('baz', 'one'), ('baz', 'two')], - names=['first', 'second']) - self.assertEqual(result.index.names, ('first', 'second') + (None,)) - self.assert_numpy_array_equal(result.index.levels[0], ['baz', 'foo']) - - def test_concat_keys_levels_no_overlap(self): - # GH #1406 - df = DataFrame(np.random.randn(1, 3), index=['a']) - df2 = DataFrame(np.random.randn(1, 4), index=['b']) - - self.assertRaises(ValueError, concat, [df, df], - keys=['one', 'two'], levels=[['foo', 'bar', 'baz']]) - - self.assertRaises(ValueError, concat, [df, df2], - keys=['one', 'two'], levels=[['foo', 'bar', 'baz']]) - - def test_concat_rename_index(self): - a = DataFrame(np.random.rand(3, 3), - columns=list('ABC'), - index=Index(list('abc'), 
name='index_a')) - b = DataFrame(np.random.rand(3, 3), - columns=list('ABC'), - index=Index(list('abc'), name='index_b')) - - result = concat([a, b], keys=['key0', 'key1'], - names=['lvl0', 'lvl1']) - - exp = concat([a, b], keys=['key0', 'key1'], names=['lvl0']) - names = list(exp.index.names) - names[1] = 'lvl1' - exp.index.set_names(names, inplace=True) - - tm.assert_frame_equal(result, exp) - self.assertEqual(result.index.names, exp.index.names) - - def test_crossed_dtypes_weird_corner(self): - columns = ['A', 'B', 'C', 'D'] - df1 = DataFrame({'A': np.array([1, 2, 3, 4], dtype='f8'), - 'B': np.array([1, 2, 3, 4], dtype='i8'), - 'C': np.array([1, 2, 3, 4], dtype='f8'), - 'D': np.array([1, 2, 3, 4], dtype='i8')}, - columns=columns) - - df2 = DataFrame({'A': np.array([1, 2, 3, 4], dtype='i8'), - 'B': np.array([1, 2, 3, 4], dtype='f8'), - 'C': np.array([1, 2, 3, 4], dtype='i8'), - 'D': np.array([1, 2, 3, 4], dtype='f8')}, - columns=columns) - - appended = df1.append(df2, ignore_index=True) - expected = DataFrame(np.concatenate([df1.values, df2.values], axis=0), - columns=columns) - tm.assert_frame_equal(appended, expected) - - df = DataFrame(np.random.randn(1, 3), index=['a']) - df2 = DataFrame(np.random.randn(1, 4), index=['b']) - result = concat( - [df, df2], keys=['one', 'two'], names=['first', 'second']) - self.assertEqual(result.index.names, ('first', 'second')) - - def test_dups_index(self): - # GH 4771 - - # single dtypes - df = DataFrame(np.random.randint(0, 10, size=40).reshape( - 10, 4), columns=['A', 'A', 'C', 'C']) - - result = concat([df, df], axis=1) - assert_frame_equal(result.iloc[:, :4], df) - assert_frame_equal(result.iloc[:, 4:], df) - - result = concat([df, df], axis=0) - assert_frame_equal(result.iloc[:10], df) - assert_frame_equal(result.iloc[10:], df) - - # multi dtypes - df = concat([DataFrame(np.random.randn(10, 4), - columns=['A', 'A', 'B', 'B']), - DataFrame(np.random.randint(0, 10, size=20) - .reshape(10, 2), - columns=['A', 'C'])], - axis=1) - - result = concat([df, df], axis=1) - assert_frame_equal(result.iloc[:, :6], df) - assert_frame_equal(result.iloc[:, 6:], df) - - result = concat([df, df], axis=0) - assert_frame_equal(result.iloc[:10], df) - assert_frame_equal(result.iloc[10:], df) - - # append - result = df.iloc[0:8, :].append(df.iloc[8:]) - assert_frame_equal(result, df) - - result = df.iloc[0:8, :].append(df.iloc[8:9]).append(df.iloc[9:10]) - assert_frame_equal(result, df) - - expected = concat([df, df], axis=0) - result = df.append(df) - assert_frame_equal(result, expected) - - def test_with_mixed_tuples(self): - # 10697 - # columns have mixed tuples, so handle properly - df1 = DataFrame({u'A': 'foo', (u'B', 1): 'bar'}, index=range(2)) - df2 = DataFrame({u'B': 'foo', (u'B', 1): 'bar'}, index=range(2)) - - # it works - concat([df1, df2]) - - def test_join_dups(self): - - # joining dups - df = concat([DataFrame(np.random.randn(10, 4), - columns=['A', 'A', 'B', 'B']), - DataFrame(np.random.randint(0, 10, size=20) - .reshape(10, 2), - columns=['A', 'C'])], - axis=1) - - expected = concat([df, df], axis=1) - result = df.join(df, rsuffix='_2') - result.columns = expected.columns - assert_frame_equal(result, expected) - - # GH 4975, invalid join on dups - w = DataFrame(np.random.randn(4, 2), columns=["x", "y"]) - x = DataFrame(np.random.randn(4, 2), columns=["x", "y"]) - y = DataFrame(np.random.randn(4, 2), columns=["x", "y"]) - z = DataFrame(np.random.randn(4, 2), columns=["x", "y"]) - - dta = x.merge(y, left_index=True, right_index=True).merge( - z, 
left_index=True, right_index=True, how="outer") - dta = dta.merge(w, left_index=True, right_index=True) - expected = concat([x, y, z, w], axis=1) - expected.columns = ['x_x', 'y_x', 'x_y', - 'y_y', 'x_x', 'y_x', 'x_y', 'y_y'] - assert_frame_equal(dta, expected) - - def test_handle_empty_objects(self): - df = DataFrame(np.random.randn(10, 4), columns=list('abcd')) - - baz = df[:5].copy() - baz['foo'] = 'bar' - empty = df[5:5] - - frames = [baz, empty, empty, df[5:]] - concatted = concat(frames, axis=0) - - expected = df.ix[:, ['a', 'b', 'c', 'd', 'foo']] - expected['foo'] = expected['foo'].astype('O') - expected.loc[0:4, 'foo'] = 'bar' - - tm.assert_frame_equal(concatted, expected) - - # empty as first element with time series - # GH3259 - df = DataFrame(dict(A=range(10000)), index=date_range( - '20130101', periods=10000, freq='s')) - empty = DataFrame() - result = concat([df, empty], axis=1) - assert_frame_equal(result, df) - result = concat([empty, df], axis=1) - assert_frame_equal(result, df) - - result = concat([df, empty]) - assert_frame_equal(result, df) - result = concat([empty, df]) - assert_frame_equal(result, df) - - def test_concat_mixed_objs(self): - - # concat mixed series/frames - # G2385 - - # axis 1 - index = date_range('01-Jan-2013', periods=10, freq='H') - arr = np.arange(10, dtype='int64') - s1 = Series(arr, index=index) - s2 = Series(arr, index=index) - df = DataFrame(arr.reshape(-1, 1), index=index) - - expected = DataFrame(np.repeat(arr, 2).reshape(-1, 2), - index=index, columns=[0, 0]) - result = concat([df, df], axis=1) - assert_frame_equal(result, expected) - - expected = DataFrame(np.repeat(arr, 2).reshape(-1, 2), - index=index, columns=[0, 1]) - result = concat([s1, s2], axis=1) - assert_frame_equal(result, expected) - - expected = DataFrame(np.repeat(arr, 3).reshape(-1, 3), - index=index, columns=[0, 1, 2]) - result = concat([s1, s2, s1], axis=1) - assert_frame_equal(result, expected) - - expected = DataFrame(np.repeat(arr, 5).reshape(-1, 5), - index=index, columns=[0, 0, 1, 2, 3]) - result = concat([s1, df, s2, s2, s1], axis=1) - assert_frame_equal(result, expected) - - # with names - s1.name = 'foo' - expected = DataFrame(np.repeat(arr, 3).reshape(-1, 3), - index=index, columns=['foo', 0, 0]) - result = concat([s1, df, s2], axis=1) - assert_frame_equal(result, expected) - - s2.name = 'bar' - expected = DataFrame(np.repeat(arr, 3).reshape(-1, 3), - index=index, columns=['foo', 0, 'bar']) - result = concat([s1, df, s2], axis=1) - assert_frame_equal(result, expected) - - # ignore index - expected = DataFrame(np.repeat(arr, 3).reshape(-1, 3), - index=index, columns=[0, 1, 2]) - result = concat([s1, df, s2], axis=1, ignore_index=True) - assert_frame_equal(result, expected) - - # axis 0 - expected = DataFrame(np.tile(arr, 3).reshape(-1, 1), - index=index.tolist() * 3, columns=[0]) - result = concat([s1, df, s2]) - assert_frame_equal(result, expected) - - expected = DataFrame(np.tile(arr, 3).reshape(-1, 1), columns=[0]) - result = concat([s1, df, s2], ignore_index=True) - assert_frame_equal(result, expected) - - # invalid concatente of mixed dims - panel = tm.makePanel() - self.assertRaises(ValueError, lambda: concat([panel, s1], axis=1)) - - def test_panel_join(self): - panel = tm.makePanel() - tm.add_nans(panel) - - p1 = panel.ix[:2, :10, :3] - p2 = panel.ix[2:, 5:, 2:] - - # left join - result = p1.join(p2) - expected = p1.copy() - expected['ItemC'] = p2['ItemC'] - tm.assert_panel_equal(result, expected) - - # right join - result = p1.join(p2, how='right') - 
expected = p2.copy() - expected['ItemA'] = p1['ItemA'] - expected['ItemB'] = p1['ItemB'] - expected = expected.reindex(items=['ItemA', 'ItemB', 'ItemC']) - tm.assert_panel_equal(result, expected) - - # inner join - result = p1.join(p2, how='inner') - expected = panel.ix[:, 5:10, 2:3] - tm.assert_panel_equal(result, expected) - - # outer join - result = p1.join(p2, how='outer') - expected = p1.reindex(major=panel.major_axis, - minor=panel.minor_axis) - expected = expected.join(p2.reindex(major=panel.major_axis, - minor=panel.minor_axis)) - tm.assert_panel_equal(result, expected) - - def test_panel_join_overlap(self): - panel = tm.makePanel() - tm.add_nans(panel) - - p1 = panel.ix[['ItemA', 'ItemB', 'ItemC']] - p2 = panel.ix[['ItemB', 'ItemC']] - - # Expected index is - # - # ItemA, ItemB_p1, ItemC_p1, ItemB_p2, ItemC_p2 - joined = p1.join(p2, lsuffix='_p1', rsuffix='_p2') - p1_suf = p1.ix[['ItemB', 'ItemC']].add_suffix('_p1') - p2_suf = p2.ix[['ItemB', 'ItemC']].add_suffix('_p2') - no_overlap = panel.ix[['ItemA']] - expected = no_overlap.join(p1_suf.join(p2_suf)) - tm.assert_panel_equal(joined, expected) - - def test_panel_join_many(self): - tm.K = 10 - panel = tm.makePanel() - tm.K = 4 - - panels = [panel.ix[:2], panel.ix[2:6], panel.ix[6:]] - - joined = panels[0].join(panels[1:]) - tm.assert_panel_equal(joined, panel) - - panels = [panel.ix[:2, :-5], panel.ix[2:6, 2:], panel.ix[6:, 5:-7]] - - data_dict = {} - for p in panels: - data_dict.update(p.iteritems()) - - joined = panels[0].join(panels[1:], how='inner') - expected = Panel.from_dict(data_dict, intersect=True) - tm.assert_panel_equal(joined, expected) - - joined = panels[0].join(panels[1:], how='outer') - expected = Panel.from_dict(data_dict, intersect=False) - tm.assert_panel_equal(joined, expected) - - # edge cases - self.assertRaises(ValueError, panels[0].join, panels[1:], - how='outer', lsuffix='foo', rsuffix='bar') - self.assertRaises(ValueError, panels[0].join, panels[1:], - how='right') - - def test_panel_concat_other_axes(self): - panel = tm.makePanel() - - p1 = panel.ix[:, :5, :] - p2 = panel.ix[:, 5:, :] - - result = concat([p1, p2], axis=1) - tm.assert_panel_equal(result, panel) - - p1 = panel.ix[:, :, :2] - p2 = panel.ix[:, :, 2:] - - result = concat([p1, p2], axis=2) - tm.assert_panel_equal(result, panel) - - # if things are a bit misbehaved - p1 = panel.ix[:2, :, :2] - p2 = panel.ix[:, :, 2:] - p1['ItemC'] = 'baz' - - result = concat([p1, p2], axis=2) - - expected = panel.copy() - expected['ItemC'] = expected['ItemC'].astype('O') - expected.ix['ItemC', :, :2] = 'baz' - tm.assert_panel_equal(result, expected) - - def test_panel_concat_buglet(self): - # #2257 - def make_panel(): - index = 5 - cols = 3 - - def df(): - return DataFrame(np.random.randn(index, cols), - index=["I%s" % i for i in range(index)], - columns=["C%s" % i for i in range(cols)]) - return Panel(dict([("Item%s" % x, df()) for x in ['A', 'B', 'C']])) - - panel1 = make_panel() - panel2 = make_panel() - - panel2 = panel2.rename_axis(dict([(x, "%s_1" % x) - for x in panel2.major_axis]), - axis=1) - - panel3 = panel2.rename_axis(lambda x: '%s_1' % x, axis=1) - panel3 = panel3.rename_axis(lambda x: '%s_1' % x, axis=2) - - # it works! 
- concat([panel1, panel3], axis=1, verify_integrity=True) - - def test_panel4d_concat(self): - p4d = tm.makePanel4D() - - p1 = p4d.ix[:, :, :5, :] - p2 = p4d.ix[:, :, 5:, :] - - result = concat([p1, p2], axis=2) - tm.assert_panel4d_equal(result, p4d) - - p1 = p4d.ix[:, :, :, :2] - p2 = p4d.ix[:, :, :, 2:] - - result = concat([p1, p2], axis=3) - tm.assert_panel4d_equal(result, p4d) - - def test_panel4d_concat_mixed_type(self): - p4d = tm.makePanel4D() - - # if things are a bit misbehaved - p1 = p4d.ix[:, :2, :, :2] - p2 = p4d.ix[:, :, :, 2:] - p1['L5'] = 'baz' - - result = concat([p1, p2], axis=3) - - p2['L5'] = np.nan - expected = concat([p1, p2], axis=3) - expected = expected.ix[result.labels] - - tm.assert_panel4d_equal(result, expected) - - def test_concat_series(self): - - ts = tm.makeTimeSeries() - ts.name = 'foo' - - pieces = [ts[:5], ts[5:15], ts[15:]] - - result = concat(pieces) - tm.assert_series_equal(result, ts) - self.assertEqual(result.name, ts.name) - - result = concat(pieces, keys=[0, 1, 2]) - expected = ts.copy() - - ts.index = DatetimeIndex(np.array(ts.index.values, dtype='M8[ns]')) - - exp_labels = [np.repeat([0, 1, 2], [len(x) for x in pieces]), - np.arange(len(ts))] - exp_index = MultiIndex(levels=[[0, 1, 2], ts.index], - labels=exp_labels) - expected.index = exp_index - tm.assert_series_equal(result, expected) - - def test_concat_series_axis1(self): - ts = tm.makeTimeSeries() - - pieces = [ts[:-2], ts[2:], ts[2:-2]] - - result = concat(pieces, axis=1) - expected = DataFrame(pieces).T - assert_frame_equal(result, expected) - - result = concat(pieces, keys=['A', 'B', 'C'], axis=1) - expected = DataFrame(pieces, index=['A', 'B', 'C']).T - assert_frame_equal(result, expected) - - # preserve series names, #2489 - s = Series(randn(5), name='A') - s2 = Series(randn(5), name='B') - - result = concat([s, s2], axis=1) - expected = DataFrame({'A': s, 'B': s2}) - assert_frame_equal(result, expected) - - s2.name = None - result = concat([s, s2], axis=1) - self.assertTrue(np.array_equal( - result.columns, Index(['A', 0], dtype='object'))) - - # must reindex, #2603 - s = Series(randn(3), index=['c', 'a', 'b'], name='A') - s2 = Series(randn(4), index=['d', 'a', 'b', 'c'], name='B') - result = concat([s, s2], axis=1) - expected = DataFrame({'A': s, 'B': s2}) - assert_frame_equal(result, expected) - - def test_concat_single_with_key(self): - df = DataFrame(np.random.randn(10, 4)) - - result = concat([df], keys=['foo']) - expected = concat([df, df], keys=['foo', 'bar']) - tm.assert_frame_equal(result, expected[:10]) - - def test_concat_exclude_none(self): - df = DataFrame(np.random.randn(10, 4)) - - pieces = [df[:5], None, None, df[5:]] - result = concat(pieces) - tm.assert_frame_equal(result, df) - self.assertRaises(ValueError, concat, [None, None]) - - def test_concat_datetime64_block(self): - from pandas.tseries.index import date_range - - rng = date_range('1/1/2000', periods=10) - - df = DataFrame({'time': rng}) - - result = concat([df, df]) - self.assertTrue((result.iloc[:10]['time'] == rng).all()) - self.assertTrue((result.iloc[10:]['time'] == rng).all()) - - def test_concat_timedelta64_block(self): - from pandas import to_timedelta - - rng = to_timedelta(np.arange(10), unit='s') - - df = DataFrame({'time': rng}) - - result = concat([df, df]) - self.assertTrue((result.iloc[:10]['time'] == rng).all()) - self.assertTrue((result.iloc[10:]['time'] == rng).all()) - - def test_concat_keys_with_none(self): - # #1649 - df0 = DataFrame([[10, 20, 30], [10, 20, 30], [10, 20, 30]]) - - result 
= concat(dict(a=None, b=df0, c=df0[:2], d=df0[:1], e=df0)) - expected = concat(dict(b=df0, c=df0[:2], d=df0[:1], e=df0)) - tm.assert_frame_equal(result, expected) - - result = concat([None, df0, df0[:2], df0[:1], df0], - keys=['a', 'b', 'c', 'd', 'e']) - expected = concat([df0, df0[:2], df0[:1], df0], - keys=['b', 'c', 'd', 'e']) - tm.assert_frame_equal(result, expected) - - def test_concat_bug_1719(self): - ts1 = tm.makeTimeSeries() - ts2 = tm.makeTimeSeries()[::2] - - # to join with union - # these two are of different length! - left = concat([ts1, ts2], join='outer', axis=1) - right = concat([ts2, ts1], join='outer', axis=1) - - self.assertEqual(len(left), len(right)) - - def test_concat_bug_2972(self): - ts0 = Series(np.zeros(5)) - ts1 = Series(np.ones(5)) - ts0.name = ts1.name = 'same name' - result = concat([ts0, ts1], axis=1) - - expected = DataFrame({0: ts0, 1: ts1}) - expected.columns = ['same name', 'same name'] - assert_frame_equal(result, expected) - - def test_concat_bug_3602(self): - - # GH 3602, duplicate columns - df1 = DataFrame({'firmNo': [0, 0, 0, 0], 'stringvar': [ - 'rrr', 'rrr', 'rrr', 'rrr'], 'prc': [6, 6, 6, 6]}) - df2 = DataFrame({'misc': [1, 2, 3, 4], 'prc': [ - 6, 6, 6, 6], 'C': [9, 10, 11, 12]}) - expected = DataFrame([[0, 6, 'rrr', 9, 1, 6], - [0, 6, 'rrr', 10, 2, 6], - [0, 6, 'rrr', 11, 3, 6], - [0, 6, 'rrr', 12, 4, 6]]) - expected.columns = ['firmNo', 'prc', 'stringvar', 'C', 'misc', 'prc'] - - result = concat([df1, df2], axis=1) - assert_frame_equal(result, expected) - - def test_concat_series_axis1_same_names_ignore_index(self): - dates = date_range('01-Jan-2013', '01-Jan-2014', freq='MS')[0:-1] - s1 = Series(randn(len(dates)), index=dates, name='value') - s2 = Series(randn(len(dates)), index=dates, name='value') - - result = concat([s1, s2], axis=1, ignore_index=True) - self.assertTrue(np.array_equal(result.columns, [0, 1])) - - def test_concat_iterables(self): - from collections import deque, Iterable - - # GH8645 check concat works with tuples, list, generators, and weird - # stuff like deque and custom iterables - df1 = DataFrame([1, 2, 3]) - df2 = DataFrame([4, 5, 6]) - expected = DataFrame([1, 2, 3, 4, 5, 6]) - assert_frame_equal(concat((df1, df2), ignore_index=True), expected) - assert_frame_equal(concat([df1, df2], ignore_index=True), expected) - assert_frame_equal(concat((df for df in (df1, df2)), - ignore_index=True), expected) - assert_frame_equal( - concat(deque((df1, df2)), ignore_index=True), expected) - - class CustomIterator1(object): - - def __len__(self): - return 2 - - def __getitem__(self, index): - try: - return {0: df1, 1: df2}[index] - except KeyError: - raise IndexError - assert_frame_equal(pd.concat(CustomIterator1(), - ignore_index=True), expected) - - class CustomIterator2(Iterable): - - def __iter__(self): - yield df1 - yield df2 - assert_frame_equal(pd.concat(CustomIterator2(), - ignore_index=True), expected) - - def test_concat_invalid(self): - - # trying to concat a ndframe with a non-ndframe - df1 = mkdf(10, 2) - for obj in [1, dict(), [1, 2], (1, 2)]: - self.assertRaises(TypeError, lambda x: concat([df1, obj])) - - def test_concat_invalid_first_argument(self): - df1 = mkdf(10, 2) - df2 = mkdf(10, 2) - self.assertRaises(TypeError, concat, df1, df2) - - # generator ok though - concat(DataFrame(np.random.rand(5, 5)) for _ in range(3)) - - # text reader ok - # GH6583 - data = """index,A,B,C,D -foo,2,3,4,5 -bar,7,8,9,10 -baz,12,13,14,15 -qux,12,13,14,15 -foo2,12,13,14,15 -bar2,12,13,14,15 -""" - - reader = 
read_csv(StringIO(data), chunksize=1) - result = concat(reader, ignore_index=True) - expected = read_csv(StringIO(data)) - assert_frame_equal(result, expected) - - -class TestOrderedMerge(tm.TestCase): - - def setUp(self): - self.left = DataFrame({'key': ['a', 'c', 'e'], - 'lvalue': [1, 2., 3]}) - - self.right = DataFrame({'key': ['b', 'c', 'd', 'f'], - 'rvalue': [1, 2, 3., 4]}) - - # GH #813 - - def test_basic(self): - result = ordered_merge(self.left, self.right, on='key') - expected = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'f'], - 'lvalue': [1, nan, 2, nan, 3, nan], - 'rvalue': [nan, 1, 2, 3, nan, 4]}) - - assert_frame_equal(result, expected) - - def test_ffill(self): - result = ordered_merge( - self.left, self.right, on='key', fill_method='ffill') - expected = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'f'], - 'lvalue': [1., 1, 2, 2, 3, 3.], - 'rvalue': [nan, 1, 2, 3, 3, 4]}) - assert_frame_equal(result, expected) - - def test_multigroup(self): - left = concat([self.left, self.left], ignore_index=True) - # right = concat([self.right, self.right], ignore_index=True) - - left['group'] = ['a'] * 3 + ['b'] * 3 - # right['group'] = ['a'] * 4 + ['b'] * 4 - - result = ordered_merge(left, self.right, on='key', left_by='group', - fill_method='ffill') - expected = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'f'] * 2, - 'lvalue': [1., 1, 2, 2, 3, 3.] * 2, - 'rvalue': [nan, 1, 2, 3, 3, 4] * 2}) - expected['group'] = ['a'] * 6 + ['b'] * 6 - - assert_frame_equal(result, expected.ix[:, result.columns]) - - result2 = ordered_merge(self.right, left, on='key', right_by='group', - fill_method='ffill') - assert_frame_equal(result, result2.ix[:, result.columns]) - - result = ordered_merge(left, self.right, on='key', left_by='group') - self.assertTrue(result['group'].notnull().all()) - - def test_merge_type(self): - class NotADataFrame(DataFrame): - - @property - def _constructor(self): - return NotADataFrame - - nad = NotADataFrame(self.left) - result = nad.merge(self.right, on='key') - - tm.assertIsInstance(result, NotADataFrame) - - def test_empty_sequence_concat(self): - # GH 9157 - empty_pat = "[Nn]o objects" - none_pat = "objects.*None" - test_cases = [ - ((), empty_pat), - ([], empty_pat), - ({}, empty_pat), - ([None], none_pat), - ([None, None], none_pat) - ] - for df_seq, pattern in test_cases: - assertRaisesRegexp(ValueError, pattern, pd.concat, df_seq) - - pd.concat([pd.DataFrame()]) - pd.concat([None, pd.DataFrame()]) - pd.concat([pd.DataFrame(), None]) - if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) diff --git a/pandas/tools/tests/test_ordered_merge.py b/pandas/tools/tests/test_ordered_merge.py new file mode 100644 index 0000000000000..53f00d9761f32 --- /dev/null +++ b/pandas/tools/tests/test_ordered_merge.py @@ -0,0 +1,93 @@ +import nose + +import pandas as pd +from pandas import DataFrame, ordered_merge +from pandas.util import testing as tm +from pandas.util.testing import assert_frame_equal + +from numpy import nan + + +class TestOrderedMerge(tm.TestCase): + + def setUp(self): + self.left = DataFrame({'key': ['a', 'c', 'e'], + 'lvalue': [1, 2., 3]}) + + self.right = DataFrame({'key': ['b', 'c', 'd', 'f'], + 'rvalue': [1, 2, 3., 4]}) + + # GH #813 + + def test_basic(self): + result = ordered_merge(self.left, self.right, on='key') + expected = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'f'], + 'lvalue': [1, nan, 2, nan, 3, nan], + 'rvalue': [nan, 1, 2, 3, nan, 4]}) + + assert_frame_equal(result, expected) + + def 
test_ffill(self): + result = ordered_merge( + self.left, self.right, on='key', fill_method='ffill') + expected = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'f'], + 'lvalue': [1., 1, 2, 2, 3, 3.], + 'rvalue': [nan, 1, 2, 3, 3, 4]}) + assert_frame_equal(result, expected) + + def test_multigroup(self): + left = pd.concat([self.left, self.left], ignore_index=True) + # right = concat([self.right, self.right], ignore_index=True) + + left['group'] = ['a'] * 3 + ['b'] * 3 + # right['group'] = ['a'] * 4 + ['b'] * 4 + + result = ordered_merge(left, self.right, on='key', left_by='group', + fill_method='ffill') + expected = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'f'] * 2, + 'lvalue': [1., 1, 2, 2, 3, 3.] * 2, + 'rvalue': [nan, 1, 2, 3, 3, 4] * 2}) + expected['group'] = ['a'] * 6 + ['b'] * 6 + + assert_frame_equal(result, expected.ix[:, result.columns]) + + result2 = ordered_merge(self.right, left, on='key', right_by='group', + fill_method='ffill') + assert_frame_equal(result, result2.ix[:, result.columns]) + + result = ordered_merge(left, self.right, on='key', left_by='group') + self.assertTrue(result['group'].notnull().all()) + + def test_merge_type(self): + class NotADataFrame(DataFrame): + + @property + def _constructor(self): + return NotADataFrame + + nad = NotADataFrame(self.left) + result = nad.merge(self.right, on='key') + + tm.assertIsInstance(result, NotADataFrame) + + def test_empty_sequence_concat(self): + # GH 9157 + empty_pat = "[Nn]o objects" + none_pat = "objects.*None" + test_cases = [ + ((), empty_pat), + ([], empty_pat), + ({}, empty_pat), + ([None], none_pat), + ([None, None], none_pat) + ] + for df_seq, pattern in test_cases: + tm.assertRaisesRegexp(ValueError, pattern, pd.concat, df_seq) + + pd.concat([pd.DataFrame()]) + pd.concat([None, pd.DataFrame()]) + pd.concat([pd.DataFrame(), None]) + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/types/concat.py b/pandas/types/concat.py index eb18023d6409d..5cd7abb6889b7 100644 --- a/pandas/types/concat.py +++ b/pandas/types/concat.py @@ -249,7 +249,7 @@ def convert_to_pydatetime(x, axis): # thus no need to care # we require ALL of the same tz for datetimetz - tzs = set([x.tz for x in to_concat]) + tzs = set([str(x.tz) for x in to_concat]) if len(tzs) == 1: from pandas.tseries.index import DatetimeIndex new_values = np.concatenate([x.tz_localize(None).asi8 From 40b4bb4bb2a7018ed08025c5c93cd1080a0b5f7f Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 26 May 2016 19:43:45 -0400 Subject: [PATCH 60/96] TST: reorg datetime with tz tests a bit Author: Jeff Reback Closes #13301 from jreback/ts2 and squashes the following commits: 2d8fbd4 [Jeff Reback] TST: reorg tests for datetime_with_tz construction --- pandas/tests/frame/test_constructors.py | 219 ++++-------------------- pandas/tests/frame/test_dtypes.py | 116 +++++++++++++ pandas/tests/frame/test_indexing.py | 61 +++++++ pandas/tools/tests/test_merge.py | 9 + 4 files changed, 223 insertions(+), 182 deletions(-) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 1d043297aa1fa..6913df765862d 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -17,7 +17,7 @@ from pandas.compat import (lmap, long, zip, range, lrange, lzip, OrderedDict, is_platform_little_endian) from pandas import compat -from pandas import (DataFrame, Index, Series, notnull, isnull, +from pandas import (DataFrame, Index, Series, 
isnull, MultiIndex, Timedelta, Timestamp, date_range) from pandas.core.common import PandasError @@ -25,8 +25,6 @@ import pandas.core.common as com import pandas.lib as lib -from pandas.types.api import DatetimeTZDtype - from pandas.util.testing import (assert_numpy_array_equal, assert_series_equal, assert_frame_equal, @@ -1329,185 +1327,6 @@ def test_constructor_with_datetimes(self): .reset_index(drop=True), 'b': i_no_tz}) assert_frame_equal(df, expected) - def test_constructor_with_datetime_tz(self): - - # 8260 - # support datetime64 with tz - - idx = Index(date_range('20130101', periods=3, tz='US/Eastern'), - name='foo') - dr = date_range('20130110', periods=3) - - # construction - df = DataFrame({'A': idx, 'B': dr}) - self.assertTrue(df['A'].dtype, 'M8[ns, US/Eastern') - self.assertTrue(df['A'].name == 'A') - assert_series_equal(df['A'], Series(idx, name='A')) - assert_series_equal(df['B'], Series(dr, name='B')) - - # construction from dict - df2 = DataFrame(dict(A=Timestamp('20130102', tz='US/Eastern'), - B=Timestamp('20130603', tz='CET')), - index=range(5)) - assert_series_equal(df2.dtypes, Series(['datetime64[ns, US/Eastern]', - 'datetime64[ns, CET]'], - index=['A', 'B'])) - - # dtypes - tzframe = DataFrame({'A': date_range('20130101', periods=3), - 'B': date_range('20130101', periods=3, - tz='US/Eastern'), - 'C': date_range('20130101', periods=3, tz='CET')}) - tzframe.iloc[1, 1] = pd.NaT - tzframe.iloc[1, 2] = pd.NaT - result = tzframe.dtypes.sort_index() - expected = Series([np.dtype('datetime64[ns]'), - DatetimeTZDtype('datetime64[ns, US/Eastern]'), - DatetimeTZDtype('datetime64[ns, CET]')], - ['A', 'B', 'C']) - - # concat - df3 = pd.concat([df2.A.to_frame(), df2.B.to_frame()], axis=1) - assert_frame_equal(df2, df3) - - # select_dtypes - result = df3.select_dtypes(include=['datetime64[ns]']) - expected = df3.reindex(columns=[]) - assert_frame_equal(result, expected) - - # this will select based on issubclass, and these are the same class - result = df3.select_dtypes(include=['datetime64[ns, CET]']) - expected = df3 - assert_frame_equal(result, expected) - - # from index - idx2 = date_range('20130101', periods=3, tz='US/Eastern', name='foo') - df2 = DataFrame(idx2) - assert_series_equal(df2['foo'], Series(idx2, name='foo')) - df2 = DataFrame(Series(idx2)) - assert_series_equal(df2['foo'], Series(idx2, name='foo')) - - idx2 = date_range('20130101', periods=3, tz='US/Eastern') - df2 = DataFrame(idx2) - assert_series_equal(df2[0], Series(idx2, name=0)) - df2 = DataFrame(Series(idx2)) - assert_series_equal(df2[0], Series(idx2, name=0)) - - # interleave with object - result = self.tzframe.assign(D='foo').values - expected = np.array([[Timestamp('2013-01-01 00:00:00'), - Timestamp('2013-01-02 00:00:00'), - Timestamp('2013-01-03 00:00:00')], - [Timestamp('2013-01-01 00:00:00-0500', - tz='US/Eastern'), - pd.NaT, - Timestamp('2013-01-03 00:00:00-0500', - tz='US/Eastern')], - [Timestamp('2013-01-01 00:00:00+0100', tz='CET'), - pd.NaT, - Timestamp('2013-01-03 00:00:00+0100', tz='CET')], - ['foo', 'foo', 'foo']], dtype=object).T - self.assert_numpy_array_equal(result, expected) - - # interleave with only datetime64[ns] - result = self.tzframe.values - expected = np.array([[Timestamp('2013-01-01 00:00:00'), - Timestamp('2013-01-02 00:00:00'), - Timestamp('2013-01-03 00:00:00')], - [Timestamp('2013-01-01 00:00:00-0500', - tz='US/Eastern'), - pd.NaT, - Timestamp('2013-01-03 00:00:00-0500', - tz='US/Eastern')], - [Timestamp('2013-01-01 00:00:00+0100', tz='CET'), - pd.NaT, - Timestamp('2013-01-03 
00:00:00+0100', - tz='CET')]], dtype=object).T - self.assert_numpy_array_equal(result, expected) - - # astype - expected = np.array([[Timestamp('2013-01-01 00:00:00'), - Timestamp('2013-01-02 00:00:00'), - Timestamp('2013-01-03 00:00:00')], - [Timestamp('2013-01-01 00:00:00-0500', - tz='US/Eastern'), - pd.NaT, - Timestamp('2013-01-03 00:00:00-0500', - tz='US/Eastern')], - [Timestamp('2013-01-01 00:00:00+0100', tz='CET'), - pd.NaT, - Timestamp('2013-01-03 00:00:00+0100', - tz='CET')]], - dtype=object).T - result = self.tzframe.astype(object) - assert_frame_equal(result, DataFrame( - expected, index=self.tzframe.index, columns=self.tzframe.columns)) - - result = self.tzframe.astype('datetime64[ns]') - expected = DataFrame({'A': date_range('20130101', periods=3), - 'B': (date_range('20130101', periods=3, - tz='US/Eastern') - .tz_convert('UTC') - .tz_localize(None)), - 'C': (date_range('20130101', periods=3, - tz='CET') - .tz_convert('UTC') - .tz_localize(None))}) - expected.iloc[1, 1] = pd.NaT - expected.iloc[1, 2] = pd.NaT - assert_frame_equal(result, expected) - - # str formatting - result = self.tzframe.astype(str) - expected = np.array([['2013-01-01', '2013-01-01 00:00:00-05:00', - '2013-01-01 00:00:00+01:00'], - ['2013-01-02', 'NaT', 'NaT'], - ['2013-01-03', '2013-01-03 00:00:00-05:00', - '2013-01-03 00:00:00+01:00']], dtype=object) - self.assert_numpy_array_equal(result, expected) - - result = str(self.tzframe) - self.assertTrue('0 2013-01-01 2013-01-01 00:00:00-05:00 ' - '2013-01-01 00:00:00+01:00' in result) - self.assertTrue('1 2013-01-02 ' - 'NaT NaT' in result) - self.assertTrue('2 2013-01-03 2013-01-03 00:00:00-05:00 ' - '2013-01-03 00:00:00+01:00' in result) - - # setitem - df['C'] = idx - assert_series_equal(df['C'], Series(idx, name='C')) - - df['D'] = 'foo' - df['D'] = idx - assert_series_equal(df['D'], Series(idx, name='D')) - del df['D'] - - # assert that A & C are not sharing the same base (e.g. 
they - # are copies) - b1 = df._data.blocks[1] - b2 = df._data.blocks[2] - self.assertTrue(b1.values.equals(b2.values)) - self.assertFalse(id(b1.values.values.base) == - id(b2.values.values.base)) - - # with nan - df2 = df.copy() - df2.iloc[1, 1] = pd.NaT - df2.iloc[1, 2] = pd.NaT - result = df2['B'] - assert_series_equal(notnull(result), Series( - [True, False, True], name='B')) - assert_series_equal(df2.dtypes, df.dtypes) - - # set/reset - df = DataFrame({'A': [0, 1, 2]}, index=idx) - result = df.reset_index() - self.assertTrue(result['foo'].dtype, 'M8[ns, US/Eastern') - - result = result.set_index('foo') - tm.assert_index_equal(df.index, idx) - def test_constructor_for_list_with_dtypes(self): # TODO(wesm): unused intname = np.dtype(np.int_).name # noqa @@ -2018,3 +1837,39 @@ def test_from_records_len0_with_columns(self): self.assertTrue(np.array_equal(result.columns, ['bar'])) self.assertEqual(len(result), 0) self.assertEqual(result.index.name, 'foo') + + +class TestDataFrameConstructorWithDatetimeTZ(tm.TestCase, TestData): + + _multiprocess_can_split_ = True + + def test_from_dict(self): + + # 8260 + # support datetime64 with tz + + idx = Index(date_range('20130101', periods=3, tz='US/Eastern'), + name='foo') + dr = date_range('20130110', periods=3) + + # construction + df = DataFrame({'A': idx, 'B': dr}) + self.assertTrue(df['A'].dtype, 'M8[ns, US/Eastern') + self.assertTrue(df['A'].name == 'A') + assert_series_equal(df['A'], Series(idx, name='A')) + assert_series_equal(df['B'], Series(dr, name='B')) + + def test_from_index(self): + + # from index + idx2 = date_range('20130101', periods=3, tz='US/Eastern', name='foo') + df2 = DataFrame(idx2) + assert_series_equal(df2['foo'], Series(idx2, name='foo')) + df2 = DataFrame(Series(idx2)) + assert_series_equal(df2['foo'], Series(idx2, name='foo')) + + idx2 = date_range('20130101', periods=3, tz='US/Eastern') + df2 = DataFrame(idx2) + assert_series_equal(df2[0], Series(idx2, name=0)) + df2 = DataFrame(Series(idx2)) + assert_series_equal(df2[0], Series(idx2, name=0)) diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py index 97ca8238b78f9..064230bde791a 100644 --- a/pandas/tests/frame/test_dtypes.py +++ b/pandas/tests/frame/test_dtypes.py @@ -9,6 +9,7 @@ from pandas import (DataFrame, Series, date_range, Timedelta, Timestamp, compat, option_context) from pandas.compat import u +from pandas.core import common as com from pandas.tests.frame.common import TestData from pandas.util.testing import (assert_series_equal, assert_frame_equal, @@ -74,6 +75,21 @@ def test_empty_frame_dtypes_ftypes(self): assert_series_equal(df[:0].dtypes, ex_dtypes) assert_series_equal(df[:0].ftypes, ex_ftypes) + def test_datetime_with_tz_dtypes(self): + tzframe = DataFrame({'A': date_range('20130101', periods=3), + 'B': date_range('20130101', periods=3, + tz='US/Eastern'), + 'C': date_range('20130101', periods=3, tz='CET')}) + tzframe.iloc[1, 1] = pd.NaT + tzframe.iloc[1, 2] = pd.NaT + result = tzframe.dtypes.sort_index() + expected = Series([np.dtype('datetime64[ns]'), + com.DatetimeTZDtype('datetime64[ns, US/Eastern]'), + com.DatetimeTZDtype('datetime64[ns, CET]')], + ['A', 'B', 'C']) + + assert_series_equal(result, expected) + def test_dtypes_are_correct_after_column_slice(self): # GH6525 df = pd.DataFrame(index=range(5), columns=list("abc"), dtype=np.float_) @@ -178,6 +194,16 @@ def test_select_dtypes_bad_datetime64(self): with tm.assertRaisesRegexp(ValueError, '.+ is too specific'): df.select_dtypes(exclude=['datetime64[as]']) + def 
test_select_dtypes_datetime_with_tz(self): + + df2 = DataFrame(dict(A=Timestamp('20130102', tz='US/Eastern'), + B=Timestamp('20130603', tz='CET')), + index=range(5)) + df3 = pd.concat([df2.A.to_frame(), df2.B.to_frame()], axis=1) + result = df3.select_dtypes(include=['datetime64[ns]']) + expected = df3.reindex(columns=[]) + assert_frame_equal(result, expected) + def test_select_dtypes_str_raises(self): df = DataFrame({'a': list('abc'), 'g': list(u('abc')), @@ -394,3 +420,93 @@ def test_timedeltas(self): 'int64': 1}).sort_values() result = df.get_dtype_counts().sort_values() assert_series_equal(result, expected) + + +class TestDataFrameDatetimeWithTZ(tm.TestCase, TestData): + + _multiprocess_can_split_ = True + + def test_interleave(self): + + # interleave with object + result = self.tzframe.assign(D='foo').values + expected = np.array([[Timestamp('2013-01-01 00:00:00'), + Timestamp('2013-01-02 00:00:00'), + Timestamp('2013-01-03 00:00:00')], + [Timestamp('2013-01-01 00:00:00-0500', + tz='US/Eastern'), + pd.NaT, + Timestamp('2013-01-03 00:00:00-0500', + tz='US/Eastern')], + [Timestamp('2013-01-01 00:00:00+0100', tz='CET'), + pd.NaT, + Timestamp('2013-01-03 00:00:00+0100', tz='CET')], + ['foo', 'foo', 'foo']], dtype=object).T + self.assert_numpy_array_equal(result, expected) + + # interleave with only datetime64[ns] + result = self.tzframe.values + expected = np.array([[Timestamp('2013-01-01 00:00:00'), + Timestamp('2013-01-02 00:00:00'), + Timestamp('2013-01-03 00:00:00')], + [Timestamp('2013-01-01 00:00:00-0500', + tz='US/Eastern'), + pd.NaT, + Timestamp('2013-01-03 00:00:00-0500', + tz='US/Eastern')], + [Timestamp('2013-01-01 00:00:00+0100', tz='CET'), + pd.NaT, + Timestamp('2013-01-03 00:00:00+0100', + tz='CET')]], dtype=object).T + self.assert_numpy_array_equal(result, expected) + + def test_astype(self): + # astype + expected = np.array([[Timestamp('2013-01-01 00:00:00'), + Timestamp('2013-01-02 00:00:00'), + Timestamp('2013-01-03 00:00:00')], + [Timestamp('2013-01-01 00:00:00-0500', + tz='US/Eastern'), + pd.NaT, + Timestamp('2013-01-03 00:00:00-0500', + tz='US/Eastern')], + [Timestamp('2013-01-01 00:00:00+0100', tz='CET'), + pd.NaT, + Timestamp('2013-01-03 00:00:00+0100', + tz='CET')]], + dtype=object).T + result = self.tzframe.astype(object) + assert_frame_equal(result, DataFrame( + expected, index=self.tzframe.index, columns=self.tzframe.columns)) + + result = self.tzframe.astype('datetime64[ns]') + expected = DataFrame({'A': date_range('20130101', periods=3), + 'B': (date_range('20130101', periods=3, + tz='US/Eastern') + .tz_convert('UTC') + .tz_localize(None)), + 'C': (date_range('20130101', periods=3, + tz='CET') + .tz_convert('UTC') + .tz_localize(None))}) + expected.iloc[1, 1] = pd.NaT + expected.iloc[1, 2] = pd.NaT + assert_frame_equal(result, expected) + + def test_astype_str(self): + # str formatting + result = self.tzframe.astype(str) + expected = np.array([['2013-01-01', '2013-01-01 00:00:00-05:00', + '2013-01-01 00:00:00+01:00'], + ['2013-01-02', 'NaT', 'NaT'], + ['2013-01-03', '2013-01-03 00:00:00-05:00', + '2013-01-03 00:00:00+01:00']], dtype=object) + self.assert_numpy_array_equal(result, expected) + + result = str(self.tzframe) + self.assertTrue('0 2013-01-01 2013-01-01 00:00:00-05:00 ' + '2013-01-01 00:00:00+01:00' in result) + self.assertTrue('1 2013-01-02 ' + 'NaT NaT' in result) + self.assertTrue('2 2013-01-03 2013-01-03 00:00:00-05:00 ' + '2013-01-03 00:00:00+01:00' in result) diff --git a/pandas/tests/frame/test_indexing.py b/pandas/tests/frame/test_indexing.py 
index ca1ebe477e903..fc8456cb59840 100644 --- a/pandas/tests/frame/test_indexing.py +++ b/pandas/tests/frame/test_indexing.py @@ -2699,3 +2699,64 @@ def test_type_error_multiindex(self): result = dg['x', 0] assert_series_equal(result, expected) + + +class TestDataFrameIndexingDatetimeWithTZ(tm.TestCase, TestData): + + _multiprocess_can_split_ = True + + def setUp(self): + self.idx = Index(date_range('20130101', periods=3, tz='US/Eastern'), + name='foo') + self.dr = date_range('20130110', periods=3) + self.df = DataFrame({'A': self.idx, 'B': self.dr}) + + def test_setitem(self): + + df = self.df + idx = self.idx + + # setitem + df['C'] = idx + assert_series_equal(df['C'], Series(idx, name='C')) + + df['D'] = 'foo' + df['D'] = idx + assert_series_equal(df['D'], Series(idx, name='D')) + del df['D'] + + # assert that A & C are not sharing the same base (e.g. they + # are copies) + b1 = df._data.blocks[1] + b2 = df._data.blocks[2] + self.assertTrue(b1.values.equals(b2.values)) + self.assertFalse(id(b1.values.values.base) == + id(b2.values.values.base)) + + # with nan + df2 = df.copy() + df2.iloc[1, 1] = pd.NaT + df2.iloc[1, 2] = pd.NaT + result = df2['B'] + assert_series_equal(notnull(result), Series( + [True, False, True], name='B')) + assert_series_equal(df2.dtypes, df.dtypes) + + def test_set_reset(self): + + idx = self.idx + + # set/reset + df = DataFrame({'A': [0, 1, 2]}, index=idx) + result = df.reset_index() + self.assertTrue(result['foo'].dtype, 'M8[ns, US/Eastern') + + result = result.set_index('foo') + tm.assert_index_equal(df.index, idx) + + def test_transpose(self): + + result = self.df.T + expected = DataFrame(self.df.values.T) + expected.index = ['A', 'B'] + assert_frame_equal(result, expected) diff --git a/pandas/tools/tests/test_merge.py b/pandas/tools/tests/test_merge.py index 2f3a8f77af09b..e45ece28f5038 100644 --- a/pandas/tools/tests/test_merge.py +++ b/pandas/tools/tests/test_merge.py @@ -1133,6 +1133,15 @@ def test_concat_NaT_series(self): result = pd.concat([x, y], ignore_index=True) tm.assert_series_equal(result, expected) + def test_concat_tz_frame(self): + df2 = DataFrame(dict(A=Timestamp('20130102', tz='US/Eastern'), + B=Timestamp('20130603', tz='CET')), + index=range(5)) + + # concat + df3 = pd.concat([df2.A.to_frame(), df2.B.to_frame()], axis=1) + assert_frame_equal(df2, df3) + def test_concat_tz_series(self): # GH 11755 # tz and no tz From 4b050552faec3b6cf8a82de1e7b2df2515765f55 Mon Sep 17 00:00:00 2001 From: Chris Date: Thu, 26 May 2016 19:55:12 -0400 Subject: [PATCH 61/96] DOC: low_memory in read_csv closes #5888, xref #12686 Author: Chris Closes #13293 from chris-b1/low-memory-doc and squashes the following commits: daf9bca [Chris] DOC: low_memory in read_csv --- doc/source/io.rst | 7 +++++++ pandas/io/parsers.py | 7 +++++++ pandas/tools/tests/test_merge.py | 4 ++-- 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index 104172d9574f1..6cf41bbc50fb5 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -169,6 +169,13 @@ skipfooter : int, default ``0`` Number of lines at bottom of file to skip (unsupported with engine='c'). nrows : int, default ``None`` Number of rows of file to read. Useful for reading pieces of large files. +low_memory : boolean, default ``True`` + Internally process the file in chunks, resulting in lower memory use + while parsing, but possibly mixed type inference. To ensure no mixed + types either set ``False``, or specify the type with the ``dtype`` parameter. 
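As a quick sketch of the two remedies just described (the file and column
names here are hypothetical; this example is illustrative, not part of the
patch):

.. code-block:: python

   import pandas as pd

   # Option 1: parse the whole file in one pass, so dtype inference
   # sees every value (higher memory use, but no mixed types)
   df = pd.read_csv('data.csv', low_memory=False)

   # Option 2: keep chunked parsing, but pin the ambiguous column's type
   df = pd.read_csv('data.csv', dtype={'user_id': str})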
+ Note that the entire file is read into a single DataFrame regardless, + use the ``chunksize`` or ``iterator`` parameter to return the data in chunks. + (Only valid with C parser) NA and Missing Data Handling ++++++++++++++++++++++++++++ diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 95a7f63075167..bf4083f61155c 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -220,6 +220,13 @@ warn_bad_lines : boolean, default True If error_bad_lines is False, and warn_bad_lines is True, a warning for each "bad line" will be output. (Only valid with C parser). +low_memory : boolean, default True + Internally process the file in chunks, resulting in lower memory use + while parsing, but possibly mixed type inference. To ensure no mixed + types either set False, or specify the type with the `dtype` parameter. + Note that the entire file is read into a single DataFrame regardless, + use the `chunksize` or `iterator` parameter to return the data in chunks. + (Only valid with C parser) Returns ------- diff --git a/pandas/tools/tests/test_merge.py b/pandas/tools/tests/test_merge.py index e45ece28f5038..01c651d496ecd 100644 --- a/pandas/tools/tests/test_merge.py +++ b/pandas/tools/tests/test_merge.py @@ -1134,8 +1134,8 @@ def test_concat_NaT_series(self): tm.assert_series_equal(result, expected) def test_concat_tz_frame(self): - df2 = DataFrame(dict(A=Timestamp('20130102', tz='US/Eastern'), - B=Timestamp('20130603', tz='CET')), + df2 = DataFrame(dict(A=pd.Timestamp('20130102', tz='US/Eastern'), + B=pd.Timestamp('20130603', tz='CET')), index=range(5)) # concat From 0f1666d8adfa8e121a935309b1d7ca6effec813c Mon Sep 17 00:00:00 2001 From: Camilo Cota Date: Thu, 26 May 2016 20:10:25 -0400 Subject: [PATCH 62/96] ENH: support decimal argument in read_html #12907 closes #12907 Author: Camilo Cota Author: Camilo Cota Closes #13272 from camilocot/issue-12907 and squashes the following commits: 0c15e37 [Camilo Cota] Remove bytes in decimal default value 111625f [Camilo Cota] ENH: support decimal argument in read_html #12907 --- doc/source/whatsnew/v0.18.2.txt | 13 +++++++------ pandas/io/html.py | 24 ++++++++++++++++++------ pandas/io/tests/test_html.py | 24 ++++++++++++++++++++++++ 3 files changed, 49 insertions(+), 12 deletions(-) diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index eeee85de4b5b6..0e4d9780cb2d4 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -66,7 +66,7 @@ Other enhancements idx = pd.Index(["a1a2", "b1", "c1"]) idx.str.extractall("[ab](?P\d)") -- ``Timestamp``s can now accept positional and keyword parameters like :func:`datetime.datetime` (:issue:`10758`, :issue:`11630`) +- ``Timestamp`` s can now accept positional and keyword parameters like :func:`datetime.datetime` (:issue:`10758`, :issue:`11630`) .. ipython:: python @@ -80,6 +80,7 @@ Other enhancements - ``Categorical.astype()`` now accepts an optional boolean argument ``copy``, effective when dtype is categorical (:issue:`13209`) - Consistent with the Python API, ``pd.read_csv()`` will now interpret ``+inf`` as positive infinity (:issue:`13274`) +- ``pd.read_html()`` has gained support for the ``decimal`` option (:issue:`12907`) .. _whatsnew_0182.api: @@ -121,10 +122,10 @@ New Behavior: .. 
_whatsnew_0182.api.promote: -``Series`` type promotoion on assignment -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +``Series`` type promotion on assignment +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -A ``Series`` will now correctly promote its dtype with assignment with incompat values to the current dtype (:issue:`13234`) +A ``Series`` will now correctly promote its dtype for assignment with incompat values to the current dtype (:issue:`13234`) .. ipython:: python @@ -213,7 +214,7 @@ Bug Fixes - Bug in ``.groupby(..).resample(..)`` when the same object is called multiple times (:issue:`13174`) -- Bug in ``.to_records()`` when index name is a unicode string (:issue: `13172`) +- Bug in ``.to_records()`` when index name is a unicode string (:issue:`13172`) - Bug in calling ``.memory_usage()`` on object which doesn't implement (:issue:`12924`) @@ -238,7 +239,7 @@ Bug Fixes -- Bug in ``pd.read_csv()`` with ``engine=='python'`` in which infinities of mixed-case forms were not being interpreted properly (:issue:`13274`) +- Bug in ``pd.read_csv()`` with ``engine='python'`` in which infinities of mixed-case forms were not being interpreted properly (:issue:`13274`) diff --git a/pandas/io/html.py b/pandas/io/html.py index e350a40bfa805..48caaa39dd711 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -612,7 +612,8 @@ def _expand_elements(body): def _data_to_frame(data, header, index_col, skiprows, - parse_dates, tupleize_cols, thousands): + parse_dates, tupleize_cols, thousands, + decimal): head, body, foot = data if head: @@ -630,7 +631,7 @@ def _data_to_frame(data, header, index_col, skiprows, tp = TextParser(body, header=header, index_col=index_col, skiprows=_get_skiprows(skiprows), parse_dates=parse_dates, tupleize_cols=tupleize_cols, - thousands=thousands) + thousands=thousands, decimal=decimal) df = tp.read() return df @@ -716,7 +717,8 @@ def _validate_flavor(flavor): def _parse(flavor, io, match, header, index_col, skiprows, - parse_dates, tupleize_cols, thousands, attrs, encoding): + parse_dates, tupleize_cols, thousands, attrs, encoding, + decimal): flavor = _validate_flavor(flavor) compiled_match = re.compile(match) # you can pass a compiled regex here @@ -744,7 +746,9 @@ def _parse(flavor, io, match, header, index_col, skiprows, skiprows=skiprows, parse_dates=parse_dates, tupleize_cols=tupleize_cols, - thousands=thousands)) + thousands=thousands, + decimal=decimal + )) except EmptyDataError: # empty table continue return ret @@ -752,7 +756,8 @@ def _parse(flavor, io, match, header, index_col, skiprows, def read_html(io, match='.+', flavor=None, header=None, index_col=None, skiprows=None, attrs=None, parse_dates=False, - tupleize_cols=False, thousands=',', encoding=None): + tupleize_cols=False, thousands=',', encoding=None, + decimal='.'): r"""Read HTML tables into a ``list`` of ``DataFrame`` objects. Parameters @@ -828,6 +833,12 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None, underlying parser library (e.g., the parser library will try to use the encoding provided by the document). + decimal : str, default '.' + Character to recognize as decimal point (e.g. use ',' for European + data). + + .. 
versionadded:: 0.18.2 + Returns ------- dfs : list of DataFrames @@ -871,4 +882,5 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None, 'data (you passed a negative value)') _validate_header_arg(header) return _parse(flavor, io, match, header, index_col, skiprows, - parse_dates, tupleize_cols, thousands, attrs, encoding) + parse_dates, tupleize_cols, thousands, attrs, encoding, + decimal) diff --git a/pandas/io/tests/test_html.py b/pandas/io/tests/test_html.py index 9b68267a0a0a8..b056f34b5f00e 100644 --- a/pandas/io/tests/test_html.py +++ b/pandas/io/tests/test_html.py @@ -664,6 +664,30 @@ def test_wikipedia_states_table(self): result = self.read_html(data, 'Arizona', header=1)[0] self.assertEqual(result['sq mi'].dtype, np.dtype('float64')) + def test_decimal_rows(self): + + # GH 12907 + data = StringIO(''' + + + + + + + + + + + + +
<th>Header</th>
<td>1100#101</td>
+ + ''') + expected = DataFrame(data={'Header': 1100.101}, index=[0]) + result = self.read_html(data, decimal='#')[0] + nose.tools.assert_equal(result['Header'].dtype, np.dtype('float64')) + tm.assert_frame_equal(result, expected) + def test_bool_header_arg(self): # GH 6114 for arg in [True, False]: From e8d9e79fc7d0a31e8c37c82f1e48d51cce59e9e0 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 27 May 2016 08:40:29 -0400 Subject: [PATCH 63/96] BUG: preserve join keys dtype - closes #8596, preserve join keys dtype - adds ``Index.where`` method for all Index types (like ``np.where/Series.where``), but preserves dtypes Author: Jeff Reback Author: Mike Kelly Closes #13170 from jreback/merge2 and squashes the following commits: 0a267cf [Jeff Reback] BUG: preserve merge keys dtypes when possible 4173dbf [Mike Kelly] Preserve dtype in merge keys when possible --- doc/source/api.rst | 1 + doc/source/whatsnew/v0.18.2.txt | 58 +++++++++++++- pandas/indexes/base.py | 18 +++++ pandas/indexes/category.py | 23 ++++++ pandas/indexes/multi.py | 4 + pandas/tests/indexes/common.py | 14 +++- pandas/tests/indexes/test_category.py | 15 +++- pandas/tests/indexes/test_datetimelike.py | 67 +++++++++++++++- pandas/tests/indexes/test_multi.py | 8 ++ pandas/tests/types/test_types.py | 40 ++++++++++ pandas/tools/merge.py | 85 +++++++++++++------- pandas/tools/tests/test_merge.py | 96 ++++++++++++++++------- pandas/tseries/base.py | 75 +++++++++++++++--- pandas/types/api.py | 35 ++++++++- 14 files changed, 464 insertions(+), 75 deletions(-) create mode 100644 pandas/tests/types/test_types.py diff --git a/doc/source/api.rst b/doc/source/api.rst index 9557867c252ed..9e7ae2357c541 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -1333,6 +1333,7 @@ Modifying and Computations Index.max Index.reindex Index.repeat + Index.where Index.take Index.putmask Index.set_names diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index 0e4d9780cb2d4..dfb5ebc9379b1 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -77,11 +77,20 @@ Other enhancements - The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``decimal`` option (:issue:`12933`) - ``Index.astype()`` now accepts an optional boolean argument ``copy``, which allows optional copying if the requirements on dtype are satisfied (:issue:`13209`) +- ``Index`` now supports the ``.where()`` function for same shape indexing (:issue:`13170`) + + .. ipython:: python + + idx = pd.Index(['a', 'b', 'c']) + idx.where([True, False, True]) + - ``Categorical.astype()`` now accepts an optional boolean argument ``copy``, effective when dtype is categorical (:issue:`13209`) - Consistent with the Python API, ``pd.read_csv()`` will now interpret ``+inf`` as positive infinity (:issue:`13274`) - ``pd.read_html()`` has gained support for the ``decimal`` option (:issue:`12907`) + + .. _whatsnew_0182.api: API changes @@ -119,7 +128,6 @@ New Behavior: type(s.tolist()[0]) - .. _whatsnew_0182.api.promote: ``Series`` type promotion on assignment @@ -171,6 +179,54 @@ This will now convert integers/floats with the default unit of ``ns``. pd.to_datetime([1, 'foo'], errors='coerce') +.. _whatsnew_0182.api.merging: + +Merging changes +^^^^^^^^^^^^^^^ + +Merging will now preserve the dtype of the join keys (:issue:`8596`) + +.. ipython:: python + + df1 = pd.DataFrame({'key': [1], 'v1': [10]}) + df1 + df2 = pd.DataFrame({'key': [1, 2], 'v1': [20, 30]}) + df2 + +Previous Behavior: + +.. 
code-block:: ipython + + In [5]: pd.merge(df1, df2, how='outer') + Out[5]: + key v1 + 0 1.0 10.0 + 1 1.0 20.0 + 2 2.0 30.0 + + In [6]: pd.merge(df1, df2, how='outer').dtypes + Out[6]: + key float64 + v1 float64 + dtype: object + +New Behavior: + +We are able to preserve the join keys + +.. ipython:: python + + pd.merge(df1, df2, how='outer') + pd.merge(df1, df2, how='outer').dtypes + +Of course if you have missing values that are introduced, then the +resulting dtype will be upcast (unchanged from previous). + +.. ipython:: python + + pd.merge(df1, df2, how='outer', on='key') + pd.merge(df1, df2, how='outer', on='key').dtypes + .. _whatsnew_0182.api.other: Other API changes diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index c029a4a74d9d0..82f16becbd511 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -465,6 +465,24 @@ def repeat(self, n, *args, **kwargs): nv.validate_repeat(args, kwargs) return self._shallow_copy(self._values.repeat(n)) + def where(self, cond, other=None): + """ + .. versionadded:: 0.18.2 + + Return an Index of same shape as self and whose corresponding + entries are from self where cond is True and otherwise are from + other. + + Parameters + ---------- + cond : boolean same length as self + other : scalar, or array-like + """ + if other is None: + other = self._na_value + values = np.where(cond, self.values, other) + return self._shallow_copy_with_infer(values, dtype=self.dtype) + def ravel(self, order='C'): """ return an ndarray of the flattened values of the underlying data diff --git a/pandas/indexes/category.py b/pandas/indexes/category.py index 8f343c5de5fb6..e877e43bcc603 100644 --- a/pandas/indexes/category.py +++ b/pandas/indexes/category.py @@ -307,6 +307,29 @@ def _can_reindex(self, indexer): """ always allow reindexing """ pass + def where(self, cond, other=None): + """ + .. versionadded:: 0.18.2 + + Return an Index of same shape as self and whose corresponding + entries are from self where cond is True and otherwise are from + other. 
+ + Parameters + ---------- + cond : boolean same length as self + other : scalar, or array-like + """ + if other is None: + other = self._na_value + values = np.where(cond, self.values, other) + + from pandas.core.categorical import Categorical + cat = Categorical(values, + categories=self.categories, + ordered=self.ordered) + return self._shallow_copy(cat, **self._get_attributes_dict()) + def reindex(self, target, method=None, level=None, limit=None, tolerance=None): """ diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py index 9f71f9f17d835..05b2045a4850f 100644 --- a/pandas/indexes/multi.py +++ b/pandas/indexes/multi.py @@ -1084,6 +1084,10 @@ def repeat(self, n, *args, **kwargs): for label in self.labels], names=self.names, sortorder=self.sortorder, verify_integrity=False) + def where(self, cond, other=None): + raise NotImplementedError(".where is not supported for " + "MultiIndex operations") + def drop(self, labels, level=None, errors='raise'): """ Make new MultiIndex with passed list of labels deleted diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 8ea87e9d69c92..0002bd840def3 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -7,7 +7,7 @@ from pandas import (Series, Index, Float64Index, Int64Index, RangeIndex, MultiIndex, CategoricalIndex, DatetimeIndex, - TimedeltaIndex, PeriodIndex) + TimedeltaIndex, PeriodIndex, notnull) from pandas.util.testing import assertRaisesRegexp import pandas.util.testing as tm @@ -363,6 +363,18 @@ def test_numpy_repeat(self): tm.assertRaisesRegexp(ValueError, msg, np.repeat, i, rep, axis=0) + def test_where(self): + i = self.create_index() + result = i.where(notnull(i)) + expected = i + tm.assert_index_equal(result, expected) + + i2 = i.copy() + i2 = pd.Index([np.nan, np.nan] + i[2:].tolist()) + result = i.where(notnull(i2)) + expected = i2 + tm.assert_index_equal(result, expected) + def test_setops_errorcases(self): for name, idx in compat.iteritems(self.indices): # # non-iterable input diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index 66ddcdebff83b..7fff62b822e40 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -11,7 +11,7 @@ import numpy as np -from pandas import Categorical, compat +from pandas import Categorical, compat, notnull from pandas.util.testing import assert_almost_equal import pandas.core.config as cf import pandas as pd @@ -230,6 +230,19 @@ def f(x): ordered=False) tm.assert_categorical_equal(result, exp) + def test_where(self): + i = self.create_index() + result = i.where(notnull(i)) + expected = i + tm.assert_index_equal(result, expected) + + i2 = i.copy() + i2 = pd.CategoricalIndex([np.nan, np.nan] + i[2:].tolist(), + categories=i.categories) + result = i.where(notnull(i2)) + expected = i2 + tm.assert_index_equal(result, expected) + def test_append(self): ci = self.create_index() diff --git a/pandas/tests/indexes/test_datetimelike.py b/pandas/tests/indexes/test_datetimelike.py index bd3deb8e6ed36..b3b987ceb6ab6 100644 --- a/pandas/tests/indexes/test_datetimelike.py +++ b/pandas/tests/indexes/test_datetimelike.py @@ -7,7 +7,7 @@ from pandas import (DatetimeIndex, Float64Index, Index, Int64Index, NaT, Period, PeriodIndex, Series, Timedelta, TimedeltaIndex, date_range, period_range, - timedelta_range) + timedelta_range, notnull) import pandas.util.testing as tm @@ -449,6 +449,38 @@ def test_astype_raises(self): self.assertRaises(ValueError, idx.astype, 
'datetime64') self.assertRaises(ValueError, idx.astype, 'datetime64[D]') + def test_where_other(self): + + # other is ndarray or Index + i = pd.date_range('20130101', periods=3, tz='US/Eastern') + + for arr in [np.nan, pd.NaT]: + result = i.where(notnull(i), other=np.nan) + expected = i + tm.assert_index_equal(result, expected) + + i2 = i.copy() + i2 = Index([pd.NaT, pd.NaT] + i[2:].tolist()) + result = i.where(notnull(i2), i2) + tm.assert_index_equal(result, i2) + + i2 = i.copy() + i2 = Index([pd.NaT, pd.NaT] + i[2:].tolist()) + result = i.where(notnull(i2), i2.values) + tm.assert_index_equal(result, i2) + + def test_where_tz(self): + i = pd.date_range('20130101', periods=3, tz='US/Eastern') + result = i.where(notnull(i)) + expected = i + tm.assert_index_equal(result, expected) + + i2 = i.copy() + i2 = Index([pd.NaT, pd.NaT] + i[2:].tolist()) + result = i.where(notnull(i2)) + expected = i2 + tm.assert_index_equal(result, expected) + def test_get_loc(self): idx = pd.date_range('2000-01-01', periods=3) @@ -776,6 +808,39 @@ def test_get_loc(self): with tm.assertRaises(KeyError): idx.get_loc('2000-01-10', method='nearest', tolerance='1 day') + def test_where(self): + i = self.create_index() + result = i.where(notnull(i)) + expected = i + tm.assert_index_equal(result, expected) + + i2 = i.copy() + i2 = pd.PeriodIndex([pd.NaT, pd.NaT] + i[2:].tolist(), + freq='D') + result = i.where(notnull(i2)) + expected = i2 + tm.assert_index_equal(result, expected) + + def test_where_other(self): + + i = self.create_index() + for arr in [np.nan, pd.NaT]: + result = i.where(notnull(i), other=np.nan) + expected = i + tm.assert_index_equal(result, expected) + + i2 = i.copy() + i2 = pd.PeriodIndex([pd.NaT, pd.NaT] + i[2:].tolist(), + freq='D') + result = i.where(notnull(i2), i2) + tm.assert_index_equal(result, i2) + + i2 = i.copy() + i2 = pd.PeriodIndex([pd.NaT, pd.NaT] + i[2:].tolist(), + freq='D') + result = i.where(notnull(i2), i2.values) + tm.assert_index_equal(result, i2) + def test_get_indexer(self): idx = pd.period_range('2000-01-01', periods=3).asfreq('H', how='start') tm.assert_numpy_array_equal(idx.get_indexer(idx), [0, 1, 2]) diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index b8804daa6cf19..10d87abf0d886 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -78,6 +78,14 @@ def test_labels_dtypes(self): self.assertTrue((i.labels[0] >= 0).all()) self.assertTrue((i.labels[1] >= 0).all()) + def test_where(self): + i = MultiIndex.from_tuples([('A', 1), ('A', 2)]) + + def f(): + i.where(True) + + self.assertRaises(NotImplementedError, f) + def test_repeat(self): reps = 2 numbers = [1, 2, 3] diff --git a/pandas/tests/types/test_types.py b/pandas/tests/types/test_types.py new file mode 100644 index 0000000000000..b9f6006cab731 --- /dev/null +++ b/pandas/tests/types/test_types.py @@ -0,0 +1,40 @@ +# -*- coding: utf-8 -*- +import nose +import numpy as np + +from pandas import NaT +from pandas.types.api import (DatetimeTZDtype, CategoricalDtype, + na_value_for_dtype, pandas_dtype) + + +def test_pandas_dtype(): + + assert pandas_dtype('datetime64[ns, US/Eastern]') == DatetimeTZDtype( + 'datetime64[ns, US/Eastern]') + assert pandas_dtype('category') == CategoricalDtype() + for dtype in ['M8[ns]', 'm8[ns]', 'object', 'float64', 'int64']: + assert pandas_dtype(dtype) == np.dtype(dtype) + + +def test_na_value_for_dtype(): + for dtype in [np.dtype('M8[ns]'), np.dtype('m8[ns]'), + DatetimeTZDtype('datetime64[ns, US/Eastern]')]: + assert 
na_value_for_dtype(dtype) is NaT + + for dtype in ['u1', 'u2', 'u4', 'u8', + 'i1', 'i2', 'i4', 'i8']: + assert na_value_for_dtype(np.dtype(dtype)) == 0 + + for dtype in ['bool']: + assert na_value_for_dtype(np.dtype(dtype)) is False + + for dtype in ['f2', 'f4', 'f8']: + assert np.isnan(na_value_for_dtype(np.dtype(dtype))) + + for dtype in ['O']: + assert np.isnan(na_value_for_dtype(np.dtype(dtype))) + + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py index 3371f63db1e1c..182c0637ae29c 100644 --- a/pandas/tools/merge.py +++ b/pandas/tools/merge.py @@ -7,6 +7,7 @@ import numpy as np from pandas.compat import range, lrange, lzip, zip, map, filter import pandas.compat as compat + from pandas.core.categorical import Categorical from pandas.core.frame import DataFrame, _merge_doc from pandas.core.generic import NDFrame @@ -22,6 +23,7 @@ import pandas.core.algorithms as algos import pandas.core.common as com import pandas.types.concat as _concat +from pandas.types.api import na_value_for_dtype import pandas.algos as _algos import pandas.hashtable as _hash @@ -280,55 +282,78 @@ def _indicator_post_merge(self, result): return result def _maybe_add_join_keys(self, result, left_indexer, right_indexer): - # insert group keys + + left_has_missing = None + right_has_missing = None keys = zip(self.join_names, self.left_on, self.right_on) for i, (name, lname, rname) in enumerate(keys): if not _should_fill(lname, rname): continue + take_left, take_right = None, None + if name in result: - key_indexer = result.columns.get_loc(name) if left_indexer is not None and right_indexer is not None: - if name in self.left: - if len(self.left) == 0: - continue - na_indexer = (left_indexer == -1).nonzero()[0] - if len(na_indexer) == 0: - continue + if left_has_missing is None: + left_has_missing = any(left_indexer == -1) + + if left_has_missing: + take_right = self.right_join_keys[i] + + if not com.is_dtype_equal(result[name].dtype, + self.left[name].dtype): + take_left = self.left[name]._values - right_na_indexer = right_indexer.take(na_indexer) - result.iloc[na_indexer, key_indexer] = ( - algos.take_1d(self.right_join_keys[i], - right_na_indexer)) elif name in self.right: - if len(self.right) == 0: - continue - na_indexer = (right_indexer == -1).nonzero()[0] - if len(na_indexer) == 0: - continue + if right_has_missing is None: + right_has_missing = any(right_indexer == -1) + + if right_has_missing: + take_left = self.left_join_keys[i] + + if not com.is_dtype_equal(result[name].dtype, + self.right[name].dtype): + take_right = self.right[name]._values - left_na_indexer = left_indexer.take(na_indexer) - result.iloc[na_indexer, key_indexer] = ( - algos.take_1d(self.left_join_keys[i], - left_na_indexer)) elif left_indexer is not None \ and isinstance(self.left_join_keys[i], np.ndarray): - if name is None: - name = 'key_%d' % i + take_left = self.left_join_keys[i] + take_right = self.right_join_keys[i] - # a faster way? 
- key_col = algos.take_1d(self.left_join_keys[i], left_indexer) - na_indexer = (left_indexer == -1).nonzero()[0] - right_na_indexer = right_indexer.take(na_indexer) - key_col.put(na_indexer, algos.take_1d(self.right_join_keys[i], - right_na_indexer)) - result.insert(i, name, key_col) + if take_left is not None or take_right is not None: + + if take_left is None: + lvals = result[name]._values + else: + lfill = na_value_for_dtype(take_left.dtype) + lvals = algos.take_1d(take_left, left_indexer, + fill_value=lfill) + + if take_right is None: + rvals = result[name]._values + else: + rfill = na_value_for_dtype(take_right.dtype) + rvals = algos.take_1d(take_right, right_indexer, + fill_value=rfill) + + # if we have an all missing left_indexer + # make sure to just use the right values + mask = left_indexer == -1 + if mask.all(): + key_col = rvals + else: + key_col = Index(lvals).where(~mask, rvals) + + if name in result: + result[name] = key_col + else: + result.insert(i, name or 'key_%d' % i, key_col) def _get_join_info(self): left_ax = self.left._data.axes[self.axis] diff --git a/pandas/tools/tests/test_merge.py b/pandas/tools/tests/test_merge.py index 01c651d496ecd..0b934d5f02b15 100644 --- a/pandas/tools/tests/test_merge.py +++ b/pandas/tools/tests/test_merge.py @@ -506,11 +506,10 @@ def test_join_many_non_unique_index(self): expected = merge(df_partially_merged, df3, on=['a', 'b'], how='outer') result = result.reset_index() - - result['a'] = result['a'].astype(np.float64) - result['b'] = result['b'].astype(np.float64) - - assert_frame_equal(result, expected.ix[:, result.columns]) + expected = expected[result.columns] + expected['a'] = expected.a.astype('int64') + expected['b'] = expected.b.astype('int64') + assert_frame_equal(result, expected) df1 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 1], "c": [10, 20, 30]}) df2 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 2], "d": [100, 200, 300]}) @@ -674,14 +673,35 @@ def test_intelligently_handle_join_key(self): 'rvalue': lrange(6)}) joined = merge(left, right, on='key', how='outer') - expected = DataFrame({'key': [1, 1, 1, 1, 2, 2, 3, 4, 5.], + expected = DataFrame({'key': [1, 1, 1, 1, 2, 2, 3, 4, 5], 'value': np.array([0, 0, 1, 1, 2, 3, 4, np.nan, np.nan]), 'rvalue': np.array([0, 1, 0, 1, 2, 2, 3, 4, 5])}, columns=['value', 'key', 'rvalue']) - assert_frame_equal(joined, expected, check_dtype=False) + assert_frame_equal(joined, expected) + + def test_merge_join_key_dtype_cast(self): + # #8596 + + df1 = DataFrame({'key': [1], 'v1': [10]}) + df2 = DataFrame({'key': [2], 'v1': [20]}) + df = merge(df1, df2, how='outer') + self.assertEqual(df['key'].dtype, 'int64') - self.assertTrue(joined._data.is_consolidated()) + df1 = DataFrame({'key': [True], 'v1': [1]}) + df2 = DataFrame({'key': [False], 'v1': [0]}) + df = merge(df1, df2, how='outer') + + # GH13169 + # this really should be bool + self.assertEqual(df['key'].dtype, 'object') + + df1 = DataFrame({'val': [1]}) + df2 = DataFrame({'val': [2]}) + lkey = np.array([1]) + rkey = np.array([2]) + df = merge(df1, df2, left_on=lkey, right_on=rkey, how='outer') + self.assertEqual(df['key_0'].dtype, 'int64') def test_handle_join_key_pass_array(self): left = DataFrame({'key': [1, 1, 2, 2, 3], @@ -814,20 +834,32 @@ def test_merge_left_empty_right_notempty(self): # result will have object dtype exp_in.index = exp_in.index.astype(object) - for kwarg in [dict(left_index=True, right_index=True), - dict(left_index=True, right_on='x'), - dict(left_on='a', right_index=True), - dict(left_on='a', right_on='x')]: - + def 
             result = pd.merge(left, right, how='inner', **kwarg)
-            tm.assert_frame_equal(result, exp_in)
+            tm.assert_frame_equal(result, exp)
             result = pd.merge(left, right, how='left', **kwarg)
-            tm.assert_frame_equal(result, exp_in)
+            tm.assert_frame_equal(result, exp)
 
+        def check2(exp, kwarg):
             result = pd.merge(left, right, how='right', **kwarg)
-            tm.assert_frame_equal(result, exp_out)
+            tm.assert_frame_equal(result, exp)
             result = pd.merge(left, right, how='outer', **kwarg)
-            tm.assert_frame_equal(result, exp_out)
+            tm.assert_frame_equal(result, exp)
+
+        for kwarg in [dict(left_index=True, right_index=True),
+                      dict(left_index=True, right_on='x')]:
+            check1(exp_in, kwarg)
+            check2(exp_out, kwarg)
+
+        kwarg = dict(left_on='a', right_index=True)
+        check1(exp_in, kwarg)
+        exp_out['a'] = [0, 1, 2]
+        check2(exp_out, kwarg)
+
+        kwarg = dict(left_on='a', right_on='x')
+        check1(exp_in, kwarg)
+        exp_out['a'] = np.array([np.nan] * 3, dtype=object)
+        check2(exp_out, kwarg)
 
     def test_merge_left_notempty_right_empty(self):
         # GH 10824
@@ -846,20 +878,24 @@ def test_merge_left_notempty_right_empty(self):
 
         # result will have object dtype
         exp_in.index = exp_in.index.astype(object)
-        for kwarg in [dict(left_index=True, right_index=True),
-                      dict(left_index=True, right_on='x'),
-                      dict(left_on='a', right_index=True),
-                      dict(left_on='a', right_on='x')]:
-
+        def check1(exp, kwarg):
             result = pd.merge(left, right, how='inner', **kwarg)
-            tm.assert_frame_equal(result, exp_in)
+            tm.assert_frame_equal(result, exp)
             result = pd.merge(left, right, how='right', **kwarg)
-            tm.assert_frame_equal(result, exp_in)
+            tm.assert_frame_equal(result, exp)
 
+        def check2(exp, kwarg):
             result = pd.merge(left, right, how='left', **kwarg)
-            tm.assert_frame_equal(result, exp_out)
+            tm.assert_frame_equal(result, exp)
             result = pd.merge(left, right, how='outer', **kwarg)
-            tm.assert_frame_equal(result, exp_out)
+            tm.assert_frame_equal(result, exp)
+
+        for kwarg in [dict(left_index=True, right_index=True),
+                      dict(left_index=True, right_on='x'),
+                      dict(left_on='a', right_index=True),
+                      dict(left_on='a', right_on='x')]:
+            check1(exp_in, kwarg)
+            check2(exp_out, kwarg)
 
     def test_merge_nosort(self):
        # #2098, anything to do?
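Note: the expected frames in these test hunks encode the behavioral change made in
pandas/tools/merge.py above: an outer merge no longer blindly upcasts integer join
keys to float64. The key column is rebuilt with algos.take_1d and a dtype-appropriate
fill value from na_value_for_dtype, so when every output row gets its key from at
least one side, the integer dtype survives. A minimal sketch of the intended behavior
as a hypothetical interactive session against this branch (the frames and column
names are illustrative, not taken from the test suite):

    >>> import pandas as pd
    >>> left = pd.DataFrame({'key': [1, 2], 'lval': [1, 2]})
    >>> right = pd.DataFrame({'key': [2, 3], 'rval': [4, 5]})
    >>> merged = pd.merge(left, right, on='key', how='outer')
    >>> merged['key'].dtype  # stays integer; only lval/rval gain NaN
    dtype('int64')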
@@ -1061,7 +1097,7 @@ def test_merge_on_datetime64tz(self):
                                                   tz='US/Eastern')) + [pd.NaT],
             'value_y': [pd.NaT] + list(pd.date_range('20151011', periods=2,
                                                      tz='US/Eastern')),
-            'key': [1., 2, 3]})
+            'key': [1, 2, 3]})
         result = pd.merge(left, right, on='key', how='outer')
         assert_frame_equal(result, expected)
         self.assertEqual(result['value_x'].dtype, 'datetime64[ns, US/Eastern]')
@@ -1093,7 +1129,7 @@ def test_merge_on_periods(self):
         exp_y = pd.period_range('20151011', periods=2, freq='D')
         expected = DataFrame({'value_x': list(exp_x) + [pd.NaT],
                               'value_y': [pd.NaT] + list(exp_y),
-                              'key': [1., 2, 3]})
+                              'key': [1, 2, 3]})
         result = pd.merge(left, right, on='key', how='outer')
         assert_frame_equal(result, expected)
         self.assertEqual(result['value_x'].dtype, 'object')
@@ -1335,7 +1371,7 @@ def test_indicator(self):
             'col_conflict_x': [1, 2, np.nan, np.nan, np.nan, np.nan],
             'col_left': ['a', 'b', np.nan, np.nan, np.nan, np.nan],
             'col_conflict_y': [np.nan, 1, 2, 3, 4, 5],
-            'col_right': [np.nan, 2, 2, 2, 2, 2]}, dtype='float64')
+            'col_right': [np.nan, 2, 2, 2, 2, 2]})
         df_result['_merge'] = Categorical(
             ['left_only', 'both', 'right_only',
              'right_only', 'right_only', 'right_only'],
@@ -1414,7 +1450,7 @@ def test_indicator(self):
         df4 = DataFrame({'col1': [1, 1, 3],
                          'col2': ['b', 'x', 'y']})
 
-        hand_coded_result = DataFrame({'col1': [0, 1, 1, 3.0],
+        hand_coded_result = DataFrame({'col1': [0, 1, 1, 3],
                                        'col2': ['a', 'b', 'x', 'y']})
         hand_coded_result['_merge'] = Categorical(
             ['left_only', 'both', 'right_only', 'right_only'],
diff --git a/pandas/tseries/base.py b/pandas/tseries/base.py
index e52afa74d95e2..42631d442a990 100644
--- a/pandas/tseries/base.py
+++ b/pandas/tseries/base.py
@@ -9,6 +9,7 @@
 from pandas.compat.numpy import function as nv
 
 import numpy as np
+
 from pandas.core import common as com, algorithms
 from pandas.core.common import (is_integer, is_float,
                                 is_bool_dtype, AbstractMethodError)
@@ -74,22 +75,16 @@ def _round(self, freq, rounder):
 
         unit = to_offset(freq).nanos
 
         # round the local times
-        if getattr(self, 'tz', None) is not None:
-            values = self.tz_localize(None).asi8
-        else:
-            values = self.asi8
+        values = _ensure_datetimelike_to_i8(self)
+
         result = (unit * rounder(values / float(unit))).astype('i8')
         attribs = self._get_attributes_dict()
         if 'freq' in attribs:
             attribs['freq'] = None
         if 'tz' in attribs:
             attribs['tz'] = None
-        result = self._shallow_copy(result, **attribs)
-
-        # reconvert to local tz
-        if getattr(self, 'tz', None) is not None:
-            result = result.tz_localize(self.tz)
-        return result
+        return self._ensure_localized(
+            self._shallow_copy(result, **attribs))
 
     @Appender(_round_doc % "round")
     def round(self, freq, *args, **kwargs):
@@ -161,6 +156,29 @@ def _evaluate_compare(self, other, op):
             except TypeError:
                 return result
 
+    def _ensure_localized(self, result):
+        """
+        ensure that we are re-localized
+
+        This is for compat as we can then call this on all datetimelike
+        indexes generally (ignored for Period/Timedelta)
+
+        Parameters
+        ----------
+        result : DatetimeIndex / i8 ndarray
+
+        Returns
+        -------
+        localized DTI
+        """
+
+        # reconvert to local tz
+        if getattr(self, 'tz', None) is not None:
+            if not isinstance(result, com.ABCIndexClass):
+                result = self._simple_new(result)
+            result = result.tz_localize(self.tz)
+        return result
+
     @property
     def _box_func(self):
         """
@@ -727,6 +745,27 @@ def repeat(self, repeats, *args, **kwargs):
         nv.validate_repeat(args, kwargs)
         return self._shallow_copy(self.values.repeat(repeats), freq=None)
 
+    def where(self, cond, other=None):
+        """
+        .. versionadded:: 0.18.2
+
+        Return an Index of same shape as self and whose corresponding
+        entries are from self where cond is True and otherwise are from
+        other.
+
+        Parameters
+        ----------
+        cond : boolean same length as self
+        other : scalar, or array-like
+        """
+        other = _ensure_datetimelike_to_i8(other)
+        values = _ensure_datetimelike_to_i8(self)
+        result = np.where(cond, values, other).astype('i8')
+
+        result = self._ensure_localized(result)
+        return self._shallow_copy(result,
+                                  **self._get_attributes_dict())
+
     def summary(self, name=None):
         """
         return a summarized representation
@@ -748,3 +787,19 @@ def summary(self, name=None):
         # display as values, not quoted
         result = result.replace("'", "")
         return result
+
+
+def _ensure_datetimelike_to_i8(other):
+    """ helper for coercing an input scalar or array to i8 """
+    if lib.isscalar(other) and com.isnull(other):
+        other = tslib.iNaT
+    elif isinstance(other, com.ABCIndexClass):
+
+        # convert tz if needed
+        if getattr(other, 'tz', None) is not None:
+            other = other.tz_localize(None).asi8
+        else:
+            other = other.asi8
+    else:
+        other = np.array(other, copy=False).view('i8')
+    return other
diff --git a/pandas/types/api.py b/pandas/types/api.py
index bb61025a41a37..721d8d29bba8b 100644
--- a/pandas/types/api.py
+++ b/pandas/types/api.py
@@ -28,7 +28,11 @@ def pandas_dtype(dtype):
     -------
     np.dtype or a pandas dtype
     """
-    if isinstance(dtype, string_types):
+    if isinstance(dtype, DatetimeTZDtype):
+        return dtype
+    elif isinstance(dtype, CategoricalDtype):
+        return dtype
+    elif isinstance(dtype, string_types):
         try:
             return DatetimeTZDtype.construct_from_string(dtype)
         except TypeError:
@@ -40,3 +44,32 @@
             pass
 
     return np.dtype(dtype)
+
+def na_value_for_dtype(dtype):
+    """
+    Return a dtype compat na value
+
+    Parameters
+    ----------
+    dtype : string / dtype
+
+    Returns
+    -------
+    dtype compat na value
+    """
+
+    from pandas.core import common as com
+    from pandas import NaT
+    dtype = pandas_dtype(dtype)
+
+    if (com.is_datetime64_dtype(dtype) or
+            com.is_datetime64tz_dtype(dtype) or
+            com.is_timedelta64_dtype(dtype)):
+        return NaT
+    elif com.is_float_dtype(dtype):
+        return np.nan
+    elif com.is_integer_dtype(dtype):
+        return 0
+    elif com.is_bool_dtype(dtype):
+        return False
+    return np.nan

From ae2ca83d31ebc2f95bb4f71b504f454ec399ee41 Mon Sep 17 00:00:00 2001
From: Jeff Reback
Date: Fri, 27 May 2016 09:37:49 -0400
Subject: [PATCH 64/96] COMPAT: windows test compat for merge, xref #13170

---
 pandas/tools/tests/test_merge.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/tools/tests/test_merge.py b/pandas/tools/tests/test_merge.py
index 0b934d5f02b15..efbe4c17ea544 100644
--- a/pandas/tools/tests/test_merge.py
+++ b/pandas/tools/tests/test_merge.py
@@ -676,7 +676,7 @@ def test_intelligently_handle_join_key(self):
         expected = DataFrame({'key': [1, 1, 1, 1, 2, 2, 3, 4, 5],
                               'value': np.array([0, 0, 1, 1, 2, 3, 4,
                                                  np.nan, np.nan]),
-                              'rvalue': np.array([0, 1, 0, 1, 2, 2, 3, 4, 5])},
+                              'rvalue': [0, 1, 0, 1, 2, 2, 3, 4, 5]},
                              columns=['value', 'key', 'rvalue'])
         assert_frame_equal(joined, expected)

From c2ea8fb20611370c7db7e11506727b0b67a76662 Mon Sep 17 00:00:00 2001
From: sinhrks
Date: Sat, 28 May 2016 13:25:28 -0400
Subject: [PATCH 65/96] TST: Make numpy_array test strict

assert_numpy_array_equal now checks input is np.ndarray
assert_almost_equal now checks inputs are the same class.
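Under the stricter helpers, comparisons between unlike containers fail instead of
silently coercing, so expected values have to be wrapped explicitly. A minimal
sketch of the new contract, as a hypothetical interactive session (the exact
assertion messages are not guaranteed):

    >>> import numpy as np
    >>> import pandas.util.testing as tm
    >>> tm.assert_numpy_array_equal(np.array([1, 2]), np.array([1, 2]))  # passes
    >>> tm.assert_numpy_array_equal(np.array([1, 2]), [1, 2])  # now raises AssertionError

This is why the churn below mostly wraps bare lists in np.array(...) and moves
Index/Series comparisons over to assert_index_equal / assert_series_equal.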
Author: sinhrks

Closes #13311 from sinhrks/test_ndarray and squashes the following commits:

7ae4e93 [sinhrks] TST: Make numpy_array test strict
---
 pandas/computation/tests/test_eval.py | 16 +-
 pandas/core/common.py | 4 +-
 pandas/io/tests/json/test_pandas.py | 17 +-
 pandas/io/tests/json/test_ujson.py | 86 ++--
 pandas/io/tests/parser/comment.py | 32 +-
 pandas/io/tests/parser/common.py | 56 +-
 pandas/io/tests/parser/header.py | 26 +-
 pandas/io/tests/parser/na_values.py | 17 +-
 pandas/io/tests/parser/python_parser_only.py | 3 +-
 pandas/io/tests/parser/test_read_fwf.py | 8 +-
 pandas/io/tests/parser/test_textreader.py | 20 +-
 pandas/io/tests/test_html.py | 2 +-
 pandas/io/tests/test_packers.py | 28 +-
 pandas/io/tests/test_pickle.py | 2 +-
 pandas/io/tests/test_pytables.py | 2 +-
 pandas/sparse/tests/test_frame.py | 13 +-
 pandas/sparse/tests/test_libsparse.py | 9 +-
 pandas/sparse/tests/test_panel.py | 3 +-
 pandas/sparse/tests/test_series.py | 15 +-
 pandas/stats/tests/test_fama_macbeth.py | 4 +-
 pandas/stats/tests/test_ols.py | 123 +++--
 pandas/tests/frame/test_alter_axes.py | 50 +-
 pandas/tests/frame/test_analytics.py | 484 +++++++++---------
 .../tests/frame/test_axis_select_reindex.py | 33 +-
 pandas/tests/frame/test_block_internals.py | 4 +-
 pandas/tests/frame/test_constructors.py | 339 ++++++------
 pandas/tests/frame/test_convert_to.py | 43 +-
 pandas/tests/frame/test_dtypes.py | 13 +-
 pandas/tests/frame/test_indexing.py | 22 +-
 pandas/tests/frame/test_misc_api.py | 18 +-
 pandas/tests/frame/test_missing.py | 10 +-
 pandas/tests/frame/test_mutate_columns.py | 8 +-
 pandas/tests/frame/test_operators.py | 4 +-
 pandas/tests/frame/test_reshape.py | 2 +-
 pandas/tests/frame/test_to_csv.py | 2 +-
 pandas/tests/indexes/common.py | 2 +-
 pandas/tests/indexes/test_base.py | 129 +++--
 pandas/tests/indexes/test_category.py | 46 +-
 pandas/tests/indexes/test_datetimelike.py | 79 +--
 pandas/tests/indexes/test_multi.py | 59 ++-
 pandas/tests/indexes/test_numeric.py | 93 ++--
 pandas/tests/indexes/test_range.py | 151 +++---
 pandas/tests/indexing/test_floats.py | 6 +-
 pandas/tests/indexing/test_indexing.py | 4 +-
 pandas/tests/series/test_alter_axes.py | 4 +-
 pandas/tests/series/test_analytics.py | 31 +-
 pandas/tests/series/test_apply.py | 2 +-
 pandas/tests/series/test_combine_concat.py | 6 +-
 pandas/tests/series/test_constructors.py | 8 +-
 pandas/tests/series/test_dtypes.py | 2 +-
 pandas/tests/series/test_indexing.py | 2 +-
 pandas/tests/series/test_io.py | 2 +-
 pandas/tests/series/test_misc_api.py | 2 +-
 pandas/tests/series/test_missing.py | 18 +-
 pandas/tests/series/test_operators.py | 5 +-
 pandas/tests/series/test_timeseries.py | 6 +-
 pandas/tests/test_algos.py | 24 +-
 pandas/tests/test_base.py | 101 ++--
 pandas/tests/test_categorical.py | 295 ++++++-----
 pandas/tests/test_expressions.py | 12 +-
 pandas/tests/test_generic.py | 14 +-
 pandas/tests/test_graphics.py | 10 +-
 pandas/tests/test_groupby.py | 104 ++--
 pandas/tests/test_internals.py | 74 +--
 pandas/tests/test_multilevel.py | 75 ++-
 pandas/tests/test_nanops.py | 4 +-
 pandas/tests/test_panel.py | 24 +-
 pandas/tests/test_panel4d.py | 12 +-
 pandas/tests/test_strings.py | 177 ++++---
 pandas/tests/test_testing.py | 21 +-
 pandas/tests/test_tseries.py | 39 +-
 pandas/tests/test_window.py | 414 +++++++--------
 pandas/tools/tests/test_concat.py | 6 +-
 pandas/tools/tests/test_merge.py | 27 +-
 pandas/tools/tests/test_tile.py | 69 +--
 pandas/tools/tests/test_util.py | 17 +-
 pandas/tseries/tests/test_base.py | 12 +-
 pandas/tseries/tests/test_daterange.py | 66 +--
 pandas/tseries/tests/test_offsets.py | 2 +-
 pandas/tseries/tests/test_period.py | 128 +++--
 pandas/tseries/tests/test_plotting.py | 4 +-
 pandas/tseries/tests/test_resample.py | 14 +-
 pandas/tseries/tests/test_timedeltas.py | 58 ++-
 pandas/tseries/tests/test_timeseries.py | 332 ++++++------
 .../tseries/tests/test_timeseries_legacy.py | 18 +-
 pandas/tseries/tests/test_timezones.py | 92 ++--
 pandas/tseries/tests/test_tslib.py | 5 +-
 pandas/util/testing.py | 85 +--
 88 files changed, 2332 insertions(+), 2178 deletions(-)

diff --git a/pandas/computation/tests/test_eval.py b/pandas/computation/tests/test_eval.py
index 023519fd7fc20..aaafcb5b41645 100644
--- a/pandas/computation/tests/test_eval.py
+++ b/pandas/computation/tests/test_eval.py
@@ -185,6 +185,16 @@ def test_chained_cmp_op(self):
                                          mids, cmp_ops, self.rhses):
             self.check_chained_cmp_op(lhs, cmp1, mid, cmp2, rhs)
 
+    def check_equal(self, result, expected):
+        if isinstance(result, DataFrame):
+            tm.assert_frame_equal(result, expected)
+        elif isinstance(result, Series):
+            tm.assert_series_equal(result, expected)
+        elif isinstance(result, np.ndarray):
+            tm.assert_numpy_array_equal(result, expected)
+        else:
+            self.assertEqual(result, expected)
+
     def check_complex_cmp_op(self, lhs, cmp1, rhs, binop, cmp2):
         skip_these = _scalar_skip
         ex = '(lhs {cmp1} rhs) {binop} (lhs {cmp2} rhs)'.format(cmp1=cmp1,
@@ -218,7 +228,7 @@ def check_complex_cmp_op(self, lhs, cmp1, rhs, binop, cmp2):
                 expected = _eval_single_bin(
                     lhs_new, binop, rhs_new, self.engine)
                 result = pd.eval(ex, engine=self.engine, parser=self.parser)
-                tm.assert_numpy_array_equal(result, expected)
+                self.check_equal(result, expected)
 
     def check_chained_cmp_op(self, lhs, cmp1, mid, cmp2, rhs):
         skip_these = _scalar_skip
@@ -249,7 +259,7 @@ def check_simple_cmp_op(self, lhs, cmp1, rhs):
         else:
             expected = _eval_single_bin(lhs, cmp1, rhs, self.engine)
             result = pd.eval(ex, engine=self.engine, parser=self.parser)
-            tm.assert_numpy_array_equal(result, expected)
+            self.check_equal(result, expected)
 
     def check_binary_arith_op(self, lhs, arith1, rhs):
         ex = 'lhs {0} rhs'.format(arith1)
@@ -293,7 +303,7 @@ def check_floor_division(self, lhs, arith1, rhs):
         if self.engine == 'python':
             res = pd.eval(ex, engine=self.engine, parser=self.parser)
             expected = lhs // rhs
-            tm.assert_numpy_array_equal(res, expected)
+            self.check_equal(res, expected)
         else:
             self.assertRaises(TypeError, pd.eval, ex,
                               local_dict={'lhs': lhs, 'rhs': rhs},
diff --git a/pandas/core/common.py b/pandas/core/common.py
index 1be6ce810791b..03fe71d4f5125 100644
--- a/pandas/core/common.py
+++ b/pandas/core/common.py
@@ -316,8 +316,8 @@ def array_equivalent(left, right, strict_nan=False):
 
     if not strict_nan:
         # pd.isnull considers NaN and None to be equivalent.
- return lib.array_equivalent_object( - _ensure_object(left.ravel()), _ensure_object(right.ravel())) + return lib.array_equivalent_object(_ensure_object(left.ravel()), + _ensure_object(right.ravel())) for left_value, right_value in zip(left, right): if left_value is tslib.NaT and right_value is not tslib.NaT: diff --git a/pandas/io/tests/json/test_pandas.py b/pandas/io/tests/json/test_pandas.py index cad469de86fe9..43b8d6b9563f1 100644 --- a/pandas/io/tests/json/test_pandas.py +++ b/pandas/io/tests/json/test_pandas.py @@ -99,8 +99,8 @@ def test_frame_non_unique_index(self): assert_frame_equal(df, read_json(df.to_json(orient='split'), orient='split')) unser = read_json(df.to_json(orient='records'), orient='records') - self.assertTrue(df.columns.equals(unser.columns)) - tm.assert_numpy_array_equal(df.values, unser.values) + self.assert_index_equal(df.columns, unser.columns) + np.testing.assert_equal(df.values, unser.values) unser = read_json(df.to_json(orient='values'), orient='values') tm.assert_numpy_array_equal(df.values, unser.values) @@ -183,7 +183,8 @@ def _check_orient(df, orient, dtype=None, numpy=False, # index is not captured in this orientation assert_almost_equal(df.values, unser.values, check_dtype=check_numpy_dtype) - self.assertTrue(df.columns.equals(unser.columns)) + self.assert_index_equal(df.columns, unser.columns, + exact=check_column_type) elif orient == "values": # index and cols are not captured in this orientation if numpy is True and df.shape == (0, 0): @@ -302,12 +303,10 @@ def _check_all_orients(df, dtype=None, convert_axes=True, # mixed data index = pd.Index(['a', 'b', 'c', 'd', 'e']) - data = { - 'A': [0., 1., 2., 3., 4.], - 'B': [0., 1., 0., 1., 0.], - 'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'], - 'D': [True, False, True, False, True] - } + data = {'A': [0., 1., 2., 3., 4.], + 'B': [0., 1., 0., 1., 0.], + 'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'], + 'D': [True, False, True, False, True]} df = DataFrame(data=data, index=index) _check_orient(df, "split", check_dtype=False) _check_orient(df, "records", check_dtype=False) diff --git a/pandas/io/tests/json/test_ujson.py b/pandas/io/tests/json/test_ujson.py index 8e4b492c984f1..13b2dafec9c89 100644 --- a/pandas/io/tests/json/test_ujson.py +++ b/pandas/io/tests/json/test_ujson.py @@ -1201,19 +1201,19 @@ def testDataFrame(self): # column indexed outp = DataFrame(ujson.decode(ujson.encode(df))) self.assertTrue((df == outp).values.all()) - tm.assert_numpy_array_equal(df.columns, outp.columns) - tm.assert_numpy_array_equal(df.index, outp.index) + tm.assert_index_equal(df.columns, outp.columns) + tm.assert_index_equal(df.index, outp.index) dec = _clean_dict(ujson.decode(ujson.encode(df, orient="split"))) outp = DataFrame(**dec) self.assertTrue((df == outp).values.all()) - tm.assert_numpy_array_equal(df.columns, outp.columns) - tm.assert_numpy_array_equal(df.index, outp.index) + tm.assert_index_equal(df.columns, outp.columns) + tm.assert_index_equal(df.index, outp.index) outp = DataFrame(ujson.decode(ujson.encode(df, orient="records"))) outp.index = df.index self.assertTrue((df == outp).values.all()) - tm.assert_numpy_array_equal(df.columns, outp.columns) + tm.assert_index_equal(df.columns, outp.columns) outp = DataFrame(ujson.decode(ujson.encode(df, orient="values"))) outp.index = df.index @@ -1221,8 +1221,8 @@ def testDataFrame(self): outp = DataFrame(ujson.decode(ujson.encode(df, orient="index"))) self.assertTrue((df.transpose() == outp).values.all()) - tm.assert_numpy_array_equal(df.transpose().columns, 
outp.columns) - tm.assert_numpy_array_equal(df.transpose().index, outp.index) + tm.assert_index_equal(df.transpose().columns, outp.columns) + tm.assert_index_equal(df.transpose().index, outp.index) def testDataFrameNumpy(self): df = DataFrame([[1, 2, 3], [4, 5, 6]], index=[ @@ -1231,21 +1231,21 @@ def testDataFrameNumpy(self): # column indexed outp = DataFrame(ujson.decode(ujson.encode(df), numpy=True)) self.assertTrue((df == outp).values.all()) - tm.assert_numpy_array_equal(df.columns, outp.columns) - tm.assert_numpy_array_equal(df.index, outp.index) + tm.assert_index_equal(df.columns, outp.columns) + tm.assert_index_equal(df.index, outp.index) dec = _clean_dict(ujson.decode(ujson.encode(df, orient="split"), numpy=True)) outp = DataFrame(**dec) self.assertTrue((df == outp).values.all()) - tm.assert_numpy_array_equal(df.columns, outp.columns) - tm.assert_numpy_array_equal(df.index, outp.index) + tm.assert_index_equal(df.columns, outp.columns) + tm.assert_index_equal(df.index, outp.index) - outp = DataFrame(ujson.decode( - ujson.encode(df, orient="index"), numpy=True)) + outp = DataFrame(ujson.decode(ujson.encode(df, orient="index"), + numpy=True)) self.assertTrue((df.transpose() == outp).values.all()) - tm.assert_numpy_array_equal(df.transpose().columns, outp.columns) - tm.assert_numpy_array_equal(df.transpose().index, outp.index) + tm.assert_index_equal(df.transpose().columns, outp.columns) + tm.assert_index_equal(df.transpose().index, outp.index) def testDataFrameNested(self): df = DataFrame([[1, 2, 3], [4, 5, 6]], index=[ @@ -1285,20 +1285,20 @@ def testDataFrameNumpyLabelled(self): outp = DataFrame(*ujson.decode(ujson.encode(df), numpy=True, labelled=True)) self.assertTrue((df.T == outp).values.all()) - tm.assert_numpy_array_equal(df.T.columns, outp.columns) - tm.assert_numpy_array_equal(df.T.index, outp.index) + tm.assert_index_equal(df.T.columns, outp.columns) + tm.assert_index_equal(df.T.index, outp.index) outp = DataFrame(*ujson.decode(ujson.encode(df, orient="records"), numpy=True, labelled=True)) outp.index = df.index self.assertTrue((df == outp).values.all()) - tm.assert_numpy_array_equal(df.columns, outp.columns) + tm.assert_index_equal(df.columns, outp.columns) outp = DataFrame(*ujson.decode(ujson.encode(df, orient="index"), numpy=True, labelled=True)) self.assertTrue((df == outp).values.all()) - tm.assert_numpy_array_equal(df.columns, outp.columns) - tm.assert_numpy_array_equal(df.index, outp.index) + tm.assert_index_equal(df.columns, outp.columns) + tm.assert_index_equal(df.index, outp.index) def testSeries(self): s = Series([10, 20, 30, 40, 50, 60], name="series", @@ -1378,42 +1378,46 @@ def testIndex(self): i = Index([23, 45, 18, 98, 43, 11], name="index") # column indexed - outp = Index(ujson.decode(ujson.encode(i))) - self.assertTrue(i.equals(outp)) + outp = Index(ujson.decode(ujson.encode(i)), name='index') + tm.assert_index_equal(i, outp) - outp = Index(ujson.decode(ujson.encode(i), numpy=True)) - self.assertTrue(i.equals(outp)) + outp = Index(ujson.decode(ujson.encode(i), numpy=True), name='index') + tm.assert_index_equal(i, outp) dec = _clean_dict(ujson.decode(ujson.encode(i, orient="split"))) outp = Index(**dec) - self.assertTrue(i.equals(outp)) + tm.assert_index_equal(i, outp) self.assertTrue(i.name == outp.name) dec = _clean_dict(ujson.decode(ujson.encode(i, orient="split"), numpy=True)) outp = Index(**dec) - self.assertTrue(i.equals(outp)) + tm.assert_index_equal(i, outp) self.assertTrue(i.name == outp.name) - outp = Index(ujson.decode(ujson.encode(i, 
orient="values"))) - self.assertTrue(i.equals(outp)) + outp = Index(ujson.decode(ujson.encode(i, orient="values")), + name='index') + tm.assert_index_equal(i, outp) - outp = Index(ujson.decode(ujson.encode( - i, orient="values"), numpy=True)) - self.assertTrue(i.equals(outp)) + outp = Index(ujson.decode(ujson.encode(i, orient="values"), + numpy=True), name='index') + tm.assert_index_equal(i, outp) - outp = Index(ujson.decode(ujson.encode(i, orient="records"))) - self.assertTrue(i.equals(outp)) + outp = Index(ujson.decode(ujson.encode(i, orient="records")), + name='index') + tm.assert_index_equal(i, outp) - outp = Index(ujson.decode(ujson.encode( - i, orient="records"), numpy=True)) - self.assertTrue(i.equals(outp)) + outp = Index(ujson.decode(ujson.encode(i, orient="records"), + numpy=True), name='index') + tm.assert_index_equal(i, outp) - outp = Index(ujson.decode(ujson.encode(i, orient="index"))) - self.assertTrue(i.equals(outp)) + outp = Index(ujson.decode(ujson.encode(i, orient="index")), + name='index') + tm.assert_index_equal(i, outp) - outp = Index(ujson.decode(ujson.encode(i, orient="index"), numpy=True)) - self.assertTrue(i.equals(outp)) + outp = Index(ujson.decode(ujson.encode(i, orient="index"), + numpy=True), name='index') + tm.assert_index_equal(i, outp) def test_datetimeindex(self): from pandas.tseries.index import date_range @@ -1423,7 +1427,7 @@ def test_datetimeindex(self): encoded = ujson.encode(rng, date_unit='ns') decoded = DatetimeIndex(np.array(ujson.decode(encoded))) - self.assertTrue(rng.equals(decoded)) + tm.assert_index_equal(rng, decoded) ts = Series(np.random.randn(len(rng)), index=rng) decoded = Series(ujson.decode(ujson.encode(ts, date_unit='ns'))) diff --git a/pandas/io/tests/parser/comment.py b/pandas/io/tests/parser/comment.py index 07fc6a167a6c0..f7cd1e190ec16 100644 --- a/pandas/io/tests/parser/comment.py +++ b/pandas/io/tests/parser/comment.py @@ -19,14 +19,14 @@ def test_comment(self): 1,2.,4.#hello world 5.,NaN,10.0 """ - expected = [[1., 2., 4.], - [5., np.nan, 10.]] + expected = np.array([[1., 2., 4.], + [5., np.nan, 10.]]) df = self.read_csv(StringIO(data), comment='#') - tm.assert_almost_equal(df.values, expected) + tm.assert_numpy_array_equal(df.values, expected) df = self.read_table(StringIO(data), sep=',', comment='#', na_values=['NaN']) - tm.assert_almost_equal(df.values, expected) + tm.assert_numpy_array_equal(df.values, expected) def test_line_comment(self): data = """# empty @@ -35,10 +35,10 @@ def test_line_comment(self): #ignore this line 5.,NaN,10.0 """ - expected = [[1., 2., 4.], - [5., np.nan, 10.]] + expected = np.array([[1., 2., 4.], + [5., np.nan, 10.]]) df = self.read_csv(StringIO(data), comment='#') - tm.assert_almost_equal(df.values, expected) + tm.assert_numpy_array_equal(df.values, expected) # check with delim_whitespace=True df = self.read_csv(StringIO(data.replace(',', ' ')), comment='#', @@ -48,11 +48,11 @@ def test_line_comment(self): # custom line terminator is not supported # with the Python parser yet if self.engine == 'c': - expected = [[1., 2., 4.], - [5., np.nan, 10.]] + expected = np.array([[1., 2., 4.], + [5., np.nan, 10.]]) df = self.read_csv(StringIO(data.replace('\n', '*')), comment='#', lineterminator='*') - tm.assert_almost_equal(df.values, expected) + tm.assert_numpy_array_equal(df.values, expected) def test_comment_skiprows(self): data = """# empty @@ -64,9 +64,9 @@ def test_comment_skiprows(self): 5.,NaN,10.0 """ # this should ignore the first four lines (including comments) - expected = [[1., 2., 4.], [5., 
np.nan, 10.]] + expected = np.array([[1., 2., 4.], [5., np.nan, 10.]]) df = self.read_csv(StringIO(data), comment='#', skiprows=4) - tm.assert_almost_equal(df.values, expected) + tm.assert_numpy_array_equal(df.values, expected) def test_comment_header(self): data = """# empty @@ -77,9 +77,9 @@ def test_comment_header(self): 5.,NaN,10.0 """ # header should begin at the second non-comment line - expected = [[1., 2., 4.], [5., np.nan, 10.]] + expected = np.array([[1., 2., 4.], [5., np.nan, 10.]]) df = self.read_csv(StringIO(data), comment='#', header=1) - tm.assert_almost_equal(df.values, expected) + tm.assert_numpy_array_equal(df.values, expected) def test_comment_skiprows_header(self): data = """# empty @@ -94,9 +94,9 @@ def test_comment_skiprows_header(self): # skiprows should skip the first 4 lines (including comments), while # header should start from the second non-commented line starting # with line 5 - expected = [[1., 2., 4.], [5., np.nan, 10.]] + expected = np.array([[1., 2., 4.], [5., np.nan, 10.]]) df = self.read_csv(StringIO(data), comment='#', skiprows=4, header=1) - tm.assert_almost_equal(df.values, expected) + tm.assert_numpy_array_equal(df.values, expected) def test_custom_comment_char(self): data = "a,b,c\n1,2,3#ignore this!\n4,5,6#ignorethistoo" diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py index 2be0c4edb8f5d..14f4de853e118 100644 --- a/pandas/io/tests/parser/common.py +++ b/pandas/io/tests/parser/common.py @@ -232,14 +232,14 @@ def test_unnamed_columns(self): 6,7,8,9,10 11,12,13,14,15 """ - expected = [[1, 2, 3, 4, 5.], - [6, 7, 8, 9, 10], - [11, 12, 13, 14, 15]] + expected = np.array([[1, 2, 3, 4, 5], + [6, 7, 8, 9, 10], + [11, 12, 13, 14, 15]], dtype=np.int64) df = self.read_table(StringIO(data), sep=',') tm.assert_almost_equal(df.values, expected) - self.assert_numpy_array_equal(df.columns, - ['A', 'B', 'C', 'Unnamed: 3', - 'Unnamed: 4']) + self.assert_index_equal(df.columns, + Index(['A', 'B', 'C', 'Unnamed: 3', + 'Unnamed: 4'])) def test_duplicate_columns(self): # TODO: add test for condition 'mangle_dupe_cols=False' @@ -275,7 +275,7 @@ def test_read_csv_dataframe(self): df = self.read_csv(self.csv1, index_col=0, parse_dates=True) df2 = self.read_table(self.csv1, sep=',', index_col=0, parse_dates=True) - self.assert_numpy_array_equal(df.columns, ['A', 'B', 'C', 'D']) + self.assert_index_equal(df.columns, pd.Index(['A', 'B', 'C', 'D'])) self.assertEqual(df.index.name, 'index') self.assertIsInstance( df.index[0], (datetime, np.datetime64, Timestamp)) @@ -286,12 +286,12 @@ def test_read_csv_no_index_name(self): df = self.read_csv(self.csv2, index_col=0, parse_dates=True) df2 = self.read_table(self.csv2, sep=',', index_col=0, parse_dates=True) - self.assert_numpy_array_equal(df.columns, ['A', 'B', 'C', 'D', 'E']) - self.assertIsInstance( - df.index[0], (datetime, np.datetime64, Timestamp)) - self.assertEqual(df.ix[ - :, ['A', 'B', 'C', 'D'] - ].values.dtype, np.float64) + self.assert_index_equal(df.columns, + pd.Index(['A', 'B', 'C', 'D', 'E'])) + self.assertIsInstance(df.index[0], + (datetime, np.datetime64, Timestamp)) + self.assertEqual(df.ix[:, ['A', 'B', 'C', 'D']].values.dtype, + np.float64) tm.assert_frame_equal(df, df2) def test_read_table_unicode(self): @@ -1121,21 +1121,21 @@ def test_empty_lines(self): -70,.4,1 """ - expected = [[1., 2., 4.], - [5., np.nan, 10.], - [-70., .4, 1.]] + expected = np.array([[1., 2., 4.], + [5., np.nan, 10.], + [-70., .4, 1.]]) df = self.read_csv(StringIO(data)) - tm.assert_almost_equal(df.values, 
expected) + tm.assert_numpy_array_equal(df.values, expected) df = self.read_csv(StringIO(data.replace(',', ' ')), sep='\s+') - tm.assert_almost_equal(df.values, expected) - expected = [[1., 2., 4.], - [np.nan, np.nan, np.nan], - [np.nan, np.nan, np.nan], - [5., np.nan, 10.], - [np.nan, np.nan, np.nan], - [-70., .4, 1.]] + tm.assert_numpy_array_equal(df.values, expected) + expected = np.array([[1., 2., 4.], + [np.nan, np.nan, np.nan], + [np.nan, np.nan, np.nan], + [5., np.nan, 10.], + [np.nan, np.nan, np.nan], + [-70., .4, 1.]]) df = self.read_csv(StringIO(data), skip_blank_lines=False) - tm.assert_almost_equal(list(df.values), list(expected)) + tm.assert_numpy_array_equal(df.values, expected) def test_whitespace_lines(self): data = """ @@ -1146,10 +1146,10 @@ def test_whitespace_lines(self): \t 1,2.,4. 5.,NaN,10.0 """ - expected = [[1, 2., 4.], - [5., np.nan, 10.]] + expected = np.array([[1, 2., 4.], + [5., np.nan, 10.]]) df = self.read_csv(StringIO(data)) - tm.assert_almost_equal(df.values, expected) + tm.assert_numpy_array_equal(df.values, expected) def test_regex_separator(self): # see gh-6607 diff --git a/pandas/io/tests/parser/header.py b/pandas/io/tests/parser/header.py index e3c408f0af907..ca148b373d659 100644 --- a/pandas/io/tests/parser/header.py +++ b/pandas/io/tests/parser/header.py @@ -43,14 +43,14 @@ def test_no_header_prefix(self): df_pref = self.read_table(StringIO(data), sep=',', prefix='Field', header=None) - expected = [[1, 2, 3, 4, 5.], - [6, 7, 8, 9, 10], - [11, 12, 13, 14, 15]] + expected = np.array([[1, 2, 3, 4, 5], + [6, 7, 8, 9, 10], + [11, 12, 13, 14, 15]], dtype=np.int64) tm.assert_almost_equal(df_pref.values, expected) - self.assert_numpy_array_equal( - df_pref.columns, ['Field0', 'Field1', 'Field2', - 'Field3', 'Field4']) + self.assert_index_equal(df_pref.columns, + Index(['Field0', 'Field1', 'Field2', + 'Field3', 'Field4'])) def test_header_with_index_col(self): data = """foo,1,2,3 @@ -262,14 +262,14 @@ def test_no_header(self): names = ['foo', 'bar', 'baz', 'quux', 'panda'] df2 = self.read_table(StringIO(data), sep=',', names=names) - expected = [[1, 2, 3, 4, 5.], - [6, 7, 8, 9, 10], - [11, 12, 13, 14, 15]] + expected = np.array([[1, 2, 3, 4, 5], + [6, 7, 8, 9, 10], + [11, 12, 13, 14, 15]], dtype=np.int64) tm.assert_almost_equal(df.values, expected) tm.assert_almost_equal(df.values, df2.values) - self.assert_numpy_array_equal(df_pref.columns, - ['X0', 'X1', 'X2', 'X3', 'X4']) - self.assert_numpy_array_equal(df.columns, lrange(5)) + self.assert_index_equal(df_pref.columns, + Index(['X0', 'X1', 'X2', 'X3', 'X4'])) + self.assert_index_equal(df.columns, Index(lrange(5))) - self.assert_numpy_array_equal(df2.columns, names) + self.assert_index_equal(df2.columns, Index(names)) diff --git a/pandas/io/tests/parser/na_values.py b/pandas/io/tests/parser/na_values.py index 853e6242751c9..c34549835cb46 100644 --- a/pandas/io/tests/parser/na_values.py +++ b/pandas/io/tests/parser/na_values.py @@ -37,9 +37,10 @@ def test_detect_string_na(self): NA,baz NaN,nan """ - expected = [['foo', 'bar'], [nan, 'baz'], [nan, nan]] + expected = np.array([['foo', 'bar'], [nan, 'baz'], [nan, nan]], + dtype=np.object_) df = self.read_csv(StringIO(data)) - tm.assert_almost_equal(df.values, expected) + tm.assert_numpy_array_equal(df.values, expected) def test_non_string_na_values(self): # see gh-3611, na_values that are not a string are an issue @@ -126,20 +127,20 @@ def test_custom_na_values(self): -1.#IND,5,baz 7,8,NaN """ - expected = [[1., nan, 3], - [nan, 5, nan], - [7, 8, nan]] + 
expected = np.array([[1., nan, 3], + [nan, 5, nan], + [7, 8, nan]]) df = self.read_csv(StringIO(data), na_values=['baz'], skiprows=[1]) - tm.assert_almost_equal(df.values, expected) + tm.assert_numpy_array_equal(df.values, expected) df2 = self.read_table(StringIO(data), sep=',', na_values=['baz'], skiprows=[1]) - tm.assert_almost_equal(df2.values, expected) + tm.assert_numpy_array_equal(df2.values, expected) df3 = self.read_table(StringIO(data), sep=',', na_values='baz', skiprows=[1]) - tm.assert_almost_equal(df3.values, expected) + tm.assert_numpy_array_equal(df3.values, expected) def test_bool_na_values(self): data = """A,B,C diff --git a/pandas/io/tests/parser/python_parser_only.py b/pandas/io/tests/parser/python_parser_only.py index 7d1793c429f4e..a08cb36c13f80 100644 --- a/pandas/io/tests/parser/python_parser_only.py +++ b/pandas/io/tests/parser/python_parser_only.py @@ -40,7 +40,8 @@ def test_sniff_delimiter(self): baz|7|8|9 """ data = self.read_csv(StringIO(text), index_col=0, sep=None) - self.assertTrue(data.index.equals(Index(['foo', 'bar', 'baz']))) + self.assert_index_equal(data.index, + Index(['foo', 'bar', 'baz'], name='index')) data2 = self.read_csv(StringIO(text), index_col=0, delimiter='|') tm.assert_frame_equal(data, data2) diff --git a/pandas/io/tests/parser/test_read_fwf.py b/pandas/io/tests/parser/test_read_fwf.py index 5599188400368..11b10211650d6 100644 --- a/pandas/io/tests/parser/test_read_fwf.py +++ b/pandas/io/tests/parser/test_read_fwf.py @@ -217,8 +217,8 @@ def test_comment_fwf(self): 1 2. 4 #hello world 5 NaN 10.0 """ - expected = [[1, 2., 4], - [5, np.nan, 10.]] + expected = np.array([[1, 2., 4], + [5, np.nan, 10.]]) df = read_fwf(StringIO(data), colspecs=[(0, 3), (4, 9), (9, 25)], comment='#') tm.assert_almost_equal(df.values, expected) @@ -228,8 +228,8 @@ def test_1000_fwf(self): 1 2,334.0 5 10 13 10. 
""" - expected = [[1, 2334., 5], - [10, 13, 10]] + expected = np.array([[1, 2334., 5], + [10, 13, 10]]) df = read_fwf(StringIO(data), colspecs=[(0, 3), (3, 11), (12, 16)], thousands=',') tm.assert_almost_equal(df.values, expected) diff --git a/pandas/io/tests/parser/test_textreader.py b/pandas/io/tests/parser/test_textreader.py index f3de604f1ec48..c35cfca7012d3 100644 --- a/pandas/io/tests/parser/test_textreader.py +++ b/pandas/io/tests/parser/test_textreader.py @@ -76,8 +76,12 @@ def test_skipinitialspace(self): header=None) result = reader.read() - self.assert_numpy_array_equal(result[0], ['a', 'a', 'a', 'a']) - self.assert_numpy_array_equal(result[1], ['b', 'b', 'b', 'b']) + self.assert_numpy_array_equal(result[0], + np.array(['a', 'a', 'a', 'a'], + dtype=np.object_)) + self.assert_numpy_array_equal(result[1], + np.array(['b', 'b', 'b', 'b'], + dtype=np.object_)) def test_parse_booleans(self): data = 'True\nFalse\nTrue\nTrue' @@ -94,8 +98,10 @@ def test_delimit_whitespace(self): header=None) result = reader.read() - self.assert_numpy_array_equal(result[0], ['a', 'a', 'a']) - self.assert_numpy_array_equal(result[1], ['b', 'b', 'b']) + self.assert_numpy_array_equal(result[0], np.array(['a', 'a', 'a'], + dtype=np.object_)) + self.assert_numpy_array_equal(result[1], np.array(['b', 'b', 'b'], + dtype=np.object_)) def test_embedded_newline(self): data = 'a\n"hello\nthere"\nthis' @@ -103,7 +109,7 @@ def test_embedded_newline(self): reader = TextReader(StringIO(data), header=None) result = reader.read() - expected = ['a', 'hello\nthere', 'this'] + expected = np.array(['a', 'hello\nthere', 'this'], dtype=np.object_) self.assert_numpy_array_equal(result[0], expected) def test_euro_decimal(self): @@ -113,7 +119,7 @@ def test_euro_decimal(self): decimal=',', header=None) result = reader.read() - expected = [12345.67, 345.678] + expected = np.array([12345.67, 345.678]) tm.assert_almost_equal(result[0], expected) def test_integer_thousands(self): @@ -123,7 +129,7 @@ def test_integer_thousands(self): thousands=',', header=None) result = reader.read() - expected = [123456, 12500] + expected = np.array([123456, 12500], dtype=np.int64) tm.assert_almost_equal(result[0], expected) def test_integer_thousands_alt(self): diff --git a/pandas/io/tests/test_html.py b/pandas/io/tests/test_html.py index b056f34b5f00e..5a95fe7727df0 100644 --- a/pandas/io/tests/test_html.py +++ b/pandas/io/tests/test_html.py @@ -519,7 +519,7 @@ def test_nyse_wsj_commas_table(self): 'Volume', 'Price', 'Chg', '% Chg']) nrows = 100 self.assertEqual(df.shape[0], nrows) - self.assertTrue(df.columns.equals(columns)) + self.assert_index_equal(df.columns, columns) @tm.slow def test_banklist_header(self): diff --git a/pandas/io/tests/test_packers.py b/pandas/io/tests/test_packers.py index 7c61a6942e8e7..b647ec6b25717 100644 --- a/pandas/io/tests/test_packers.py +++ b/pandas/io/tests/test_packers.py @@ -150,7 +150,11 @@ def test_scalar_complex(self): def test_list_numpy_float(self): x = [np.float32(np.random.rand()) for i in range(5)] x_rec = self.encode_decode(x) - tm.assert_almost_equal(x, x_rec) + # current msgpack cannot distinguish list/tuple + tm.assert_almost_equal(tuple(x), x_rec) + + x_rec = self.encode_decode(tuple(x)) + tm.assert_almost_equal(tuple(x), x_rec) def test_list_numpy_float_complex(self): if not hasattr(np, 'complex128'): @@ -165,7 +169,11 @@ def test_list_numpy_float_complex(self): def test_list_float(self): x = [np.random.rand() for i in range(5)] x_rec = self.encode_decode(x) - tm.assert_almost_equal(x, x_rec) + # 
current msgpack cannot distinguish list/tuple + tm.assert_almost_equal(tuple(x), x_rec) + + x_rec = self.encode_decode(tuple(x)) + tm.assert_almost_equal(tuple(x), x_rec) def test_list_float_complex(self): x = [np.random.rand() for i in range(5)] + \ @@ -217,7 +225,11 @@ def test_numpy_array_complex(self): def test_list_mixed(self): x = [1.0, np.float32(3.5), np.complex128(4.25), u('foo')] x_rec = self.encode_decode(x) - tm.assert_almost_equal(x, x_rec) + # current msgpack cannot distinguish list/tuple + tm.assert_almost_equal(tuple(x), x_rec) + + x_rec = self.encode_decode(tuple(x)) + tm.assert_almost_equal(tuple(x), x_rec) class TestBasic(TestPackers): @@ -286,30 +298,30 @@ def test_basic_index(self): for s, i in self.d.items(): i_rec = self.encode_decode(i) - self.assertTrue(i.equals(i_rec)) + self.assert_index_equal(i, i_rec) # datetime with no freq (GH5506) i = Index([Timestamp('20130101'), Timestamp('20130103')]) i_rec = self.encode_decode(i) - self.assertTrue(i.equals(i_rec)) + self.assert_index_equal(i, i_rec) # datetime with timezone i = Index([Timestamp('20130101 9:00:00'), Timestamp( '20130103 11:00:00')]).tz_localize('US/Eastern') i_rec = self.encode_decode(i) - self.assertTrue(i.equals(i_rec)) + self.assert_index_equal(i, i_rec) def test_multi_index(self): for s, i in self.mi.items(): i_rec = self.encode_decode(i) - self.assertTrue(i.equals(i_rec)) + self.assert_index_equal(i, i_rec) def test_unicode(self): i = tm.makeUnicodeIndex(100) i_rec = self.encode_decode(i) - self.assertTrue(i.equals(i_rec)) + self.assert_index_equal(i, i_rec) class TestSeries(TestPackers): diff --git a/pandas/io/tests/test_pickle.py b/pandas/io/tests/test_pickle.py index 7f2813d5281cb..c12d6e02e3a2e 100644 --- a/pandas/io/tests/test_pickle.py +++ b/pandas/io/tests/test_pickle.py @@ -85,7 +85,7 @@ def compare_series_ts(self, result, expected, typ, version): tm.assert_series_equal(result, expected) tm.assert_equal(result.index.freq, expected.index.freq) tm.assert_equal(result.index.freq.normalize, False) - tm.assert_numpy_array_equal(result > 0, expected > 0) + tm.assert_series_equal(result > 0, expected > 0) # GH 9291 freq = result.index.freq diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 4c72a47dbdf6e..96b66265ea586 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -5280,7 +5280,7 @@ def test_fixed_offset_tz(self): with ensure_clean_store(self.path) as store: store['frame'] = frame recons = store['frame'] - self.assertTrue(recons.index.equals(rng)) + self.assert_index_equal(recons.index, rng) self.assertEqual(rng.tz, recons.index.tz) def test_store_timezone(self): diff --git a/pandas/sparse/tests/test_frame.py b/pandas/sparse/tests/test_frame.py index fde4ad15e1185..43d35a4e7f72e 100644 --- a/pandas/sparse/tests/test_frame.py +++ b/pandas/sparse/tests/test_frame.py @@ -97,8 +97,11 @@ def test_constructor(self): # constructed zframe from matrix above self.assertEqual(self.zframe['A'].fill_value, 0) - tm.assert_almost_equal([0, 0, 0, 0, 1, 2, 3, 4, 5, 6], - self.zframe['A'].values) + tm.assert_numpy_array_equal(pd.SparseArray([1., 2., 3., 4., 5., 6.]), + self.zframe['A'].values) + tm.assert_numpy_array_equal(np.array([0., 0., 0., 0., 1., 2., + 3., 4., 5., 6.]), + self.zframe['A'].to_dense().values) # construct no data sdf = SparseDataFrame(columns=np.arange(10), index=np.arange(10)) @@ -380,8 +383,8 @@ def test_set_value(self): res2 = res.set_value('foobar', 'qux', 1.5) self.assertIsNot(res2, res) - 
self.assert_numpy_array_equal(res2.columns, - list(self.frame.columns) + ['qux']) + self.assert_index_equal(res2.columns, + pd.Index(list(self.frame.columns) + ['qux'])) self.assertEqual(res2.get_value('foobar', 'qux'), 1.5) def test_fancy_index_misc(self): @@ -407,7 +410,7 @@ def test_getitem_overload(self): subindex = self.frame.index[indexer] subframe = self.frame[indexer] - self.assert_numpy_array_equal(subindex, subframe.index) + self.assert_index_equal(subindex, subframe.index) self.assertRaises(Exception, self.frame.__getitem__, indexer[:-1]) def test_setitem(self): diff --git a/pandas/sparse/tests/test_libsparse.py b/pandas/sparse/tests/test_libsparse.py index 6edae66d4e55b..11bf980a99fec 100644 --- a/pandas/sparse/tests/test_libsparse.py +++ b/pandas/sparse/tests/test_libsparse.py @@ -50,8 +50,10 @@ def _check_case(xloc, xlen, yloc, ylen, eloc, elen): yindex = BlockIndex(TEST_LENGTH, yloc, ylen) bresult = xindex.make_union(yindex) assert (isinstance(bresult, BlockIndex)) - tm.assert_numpy_array_equal(bresult.blocs, eloc) - tm.assert_numpy_array_equal(bresult.blengths, elen) + tm.assert_numpy_array_equal(bresult.blocs, + np.array(eloc, dtype=np.int32)) + tm.assert_numpy_array_equal(bresult.blengths, + np.array(elen, dtype=np.int32)) ixindex = xindex.to_int_index() iyindex = yindex.to_int_index() @@ -411,7 +413,8 @@ def test_to_int_index(self): block = BlockIndex(20, locs, lengths) dense = block.to_int_index() - tm.assert_numpy_array_equal(dense.indices, exp_inds) + tm.assert_numpy_array_equal(dense.indices, + np.array(exp_inds, dtype=np.int32)) def test_to_block_index(self): index = BlockIndex(10, [0, 5], [4, 5]) diff --git a/pandas/sparse/tests/test_panel.py b/pandas/sparse/tests/test_panel.py index 89a90f5be40e6..e988ddebd92f0 100644 --- a/pandas/sparse/tests/test_panel.py +++ b/pandas/sparse/tests/test_panel.py @@ -121,7 +121,8 @@ def _compare_with_dense(panel): dlp = panel.to_dense().to_frame() self.assert_numpy_array_equal(slp.values, dlp.values) - self.assertTrue(slp.index.equals(dlp.index)) + self.assert_index_equal(slp.index, dlp.index, + check_names=False) _compare_with_dense(self.panel) _compare_with_dense(self.panel.reindex(items=['ItemA'])) diff --git a/pandas/sparse/tests/test_series.py b/pandas/sparse/tests/test_series.py index 58e3dfbdf66e4..27112319ea915 100644 --- a/pandas/sparse/tests/test_series.py +++ b/pandas/sparse/tests/test_series.py @@ -294,7 +294,7 @@ def test_constructor_ndarray(self): def test_constructor_nonnan(self): arr = [0, 0, 0, nan, nan] sp_series = SparseSeries(arr, fill_value=0) - tm.assert_numpy_array_equal(sp_series.values.values, arr) + tm.assert_numpy_array_equal(sp_series.values.values, np.array(arr)) self.assertEqual(len(sp_series), 5) self.assertEqual(sp_series.shape, (5, )) @@ -726,9 +726,9 @@ def test_dropna(self): expected = sp.to_dense().valid() expected = expected[expected != 0] - - tm.assert_almost_equal(sp_valid.values, expected.values) - self.assertTrue(sp_valid.index.equals(expected.index)) + exp_arr = pd.SparseArray(expected.values, fill_value=0, kind='block') + tm.assert_sp_array_equal(sp_valid.values, exp_arr) + self.assert_index_equal(sp_valid.index, expected.index) self.assertEqual(len(sp_valid.sp_values), 2) result = self.bseries.dropna() @@ -1042,8 +1042,7 @@ def _run_test(self, ss, kwargs, check): results = (results[0].T, results[2], results[1]) self._check_results_to_coo(results, check) - @staticmethod - def _check_results_to_coo(results, check): + def _check_results_to_coo(self, results, check): (A, il, jl) = results 
(A_result, il_result, jl_result) = check # convert to dense and compare @@ -1051,8 +1050,8 @@ def _check_results_to_coo(results, check): # or compare directly as difference of sparse # assert(abs(A - A_result).max() < 1e-12) # max is failing in python # 2.6 - tm.assert_numpy_array_equal(il, il_result) - tm.assert_numpy_array_equal(jl, jl_result) + self.assertEqual(il, il_result) + self.assertEqual(jl, jl_result) def test_concat(self): val1 = np.array([1, 2, np.nan, np.nan, 0, np.nan]) diff --git a/pandas/stats/tests/test_fama_macbeth.py b/pandas/stats/tests/test_fama_macbeth.py index 2c69eb64fd61d..706becfa730c4 100644 --- a/pandas/stats/tests/test_fama_macbeth.py +++ b/pandas/stats/tests/test_fama_macbeth.py @@ -50,7 +50,9 @@ def checkFamaMacBethExtended(self, window_type, x, y, **kwds): with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): reference = fama_macbeth(y=y2, x=x2, **kwds) - assert_almost_equal(reference._stats, result._stats[:, i]) + # reference._stats is tuple + assert_almost_equal(reference._stats, result._stats[:, i], + check_dtype=False) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): static = fama_macbeth(y=y2, x=x2, **kwds) diff --git a/pandas/stats/tests/test_ols.py b/pandas/stats/tests/test_ols.py index 4932ac8ffdf99..bac824f0b4840 100644 --- a/pandas/stats/tests/test_ols.py +++ b/pandas/stats/tests/test_ols.py @@ -378,7 +378,7 @@ def test_predict_longer_exog(self): model = ols(y=endog, x=exog) pred = model.y_predict - self.assertTrue(pred.index.equals(exog.index)) + self.assert_index_equal(pred.index, exog.index) def test_longpanel_series_combo(self): wp = tm.makePanel() @@ -527,13 +527,12 @@ def testFiltering(self): index = x.index.get_level_values(0) index = Index(sorted(set(index))) exp_index = Index([datetime(2000, 1, 1), datetime(2000, 1, 3)]) - self.assertTrue - (exp_index.equals(index)) + self.assert_index_equal(exp_index, index) index = x.index.get_level_values(1) index = Index(sorted(set(index))) exp_index = Index(['A', 'B']) - self.assertTrue(exp_index.equals(index)) + self.assert_index_equal(exp_index, index) x = result._x_filtered index = x.index.get_level_values(0) @@ -541,24 +540,22 @@ def testFiltering(self): exp_index = Index([datetime(2000, 1, 1), datetime(2000, 1, 3), datetime(2000, 1, 4)]) - self.assertTrue(exp_index.equals(index)) + self.assert_index_equal(exp_index, index) - assert_almost_equal(result._y.values.flat, [1, 4, 5]) + # .flat is flatiter instance + assert_almost_equal(result._y.values.flat, [1, 4, 5], + check_dtype=False) - exp_x = [[6, 14, 1], - [9, 17, 1], - [30, 48, 1]] + exp_x = np.array([[6, 14, 1], [9, 17, 1], + [30, 48, 1]], dtype=np.float64) assert_almost_equal(exp_x, result._x.values) - exp_x_filtered = [[6, 14, 1], - [9, 17, 1], - [30, 48, 1], - [11, 20, 1], - [12, 21, 1]] + exp_x_filtered = np.array([[6, 14, 1], [9, 17, 1], [30, 48, 1], + [11, 20, 1], [12, 21, 1]], dtype=np.float64) assert_almost_equal(exp_x_filtered, result._x_filtered.values) - self.assertTrue(result._x_filtered.index.levels[0].equals( - result.y_fitted.index)) + self.assert_index_equal(result._x_filtered.index.levels[0], + result.y_fitted.index) def test_wls_panel(self): y = tm.makeTimeDataFrame() @@ -597,9 +594,11 @@ def testWithTimeEffects(self): with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = ols(y=self.panel_y2, x=self.panel_x2, time_effects=True) - assert_almost_equal(result._y_trans.values.flat, [0, -0.5, 0.5]) + # .flat is flatiter instance + 
assert_almost_equal(result._y_trans.values.flat, [0, -0.5, 0.5], + check_dtype=False) - exp_x = [[0, 0], [-10.5, -15.5], [10.5, 15.5]] + exp_x = np.array([[0, 0], [-10.5, -15.5], [10.5, 15.5]]) assert_almost_equal(result._x_trans.values, exp_x) # _check_non_raw_results(result) @@ -608,7 +607,9 @@ def testWithEntityEffects(self): with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = ols(y=self.panel_y2, x=self.panel_x2, entity_effects=True) - assert_almost_equal(result._y.values.flat, [1, 4, 5]) + # .flat is flatiter instance + assert_almost_equal(result._y.values.flat, [1, 4, 5], + check_dtype=False) exp_x = DataFrame([[0., 6., 14., 1.], [0, 9, 17, 1], [1, 30, 48, 1]], index=result._x.index, columns=['FE_B', 'x1', 'x2', @@ -622,7 +623,9 @@ def testWithEntityEffectsAndDroppedDummies(self): result = ols(y=self.panel_y2, x=self.panel_x2, entity_effects=True, dropped_dummies={'entity': 'B'}) - assert_almost_equal(result._y.values.flat, [1, 4, 5]) + # .flat is flatiter instance + assert_almost_equal(result._y.values.flat, [1, 4, 5], + check_dtype=False) exp_x = DataFrame([[1., 6., 14., 1.], [1, 9, 17, 1], [0, 30, 48, 1]], index=result._x.index, columns=['FE_A', 'x1', 'x2', 'intercept'], @@ -634,7 +637,9 @@ def testWithXEffects(self): with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = ols(y=self.panel_y2, x=self.panel_x2, x_effects=['x1']) - assert_almost_equal(result._y.values.flat, [1, 4, 5]) + # .flat is flatiter instance + assert_almost_equal(result._y.values.flat, [1, 4, 5], + check_dtype=False) res = result._x exp_x = DataFrame([[0., 0., 14., 1.], [0, 1, 17, 1], [1, 0, 48, 1]], @@ -648,7 +653,9 @@ def testWithXEffectsAndDroppedDummies(self): dropped_dummies={'x1': 30}) res = result._x - assert_almost_equal(result._y.values.flat, [1, 4, 5]) + # .flat is flatiter instance + assert_almost_equal(result._y.values.flat, [1, 4, 5], + check_dtype=False) exp_x = DataFrame([[1., 0., 14., 1.], [0, 1, 17, 1], [0, 0, 48, 1]], columns=['x1_6', 'x1_9', 'x2', 'intercept'], index=res.index, dtype=float) @@ -660,13 +667,15 @@ def testWithXEffectsAndConversion(self): result = ols(y=self.panel_y3, x=self.panel_x3, x_effects=['x1', 'x2']) - assert_almost_equal(result._y.values.flat, [1, 2, 3, 4]) - exp_x = [[0, 0, 0, 1, 1], [1, 0, 0, 0, 1], [0, 1, 1, 0, 1], - [0, 0, 0, 1, 1]] + # .flat is flatiter instance + assert_almost_equal(result._y.values.flat, [1, 2, 3, 4], + check_dtype=False) + exp_x = np.array([[0, 0, 0, 1, 1], [1, 0, 0, 0, 1], [0, 1, 1, 0, 1], + [0, 0, 0, 1, 1]], dtype=np.float64) assert_almost_equal(result._x.values, exp_x) exp_index = Index(['x1_B', 'x1_C', 'x2_baz', 'x2_foo', 'intercept']) - self.assertTrue(exp_index.equals(result._x.columns)) + self.assert_index_equal(exp_index, result._x.columns) # _check_non_raw_results(result) @@ -674,14 +683,15 @@ def testWithXEffectsAndConversionAndDroppedDummies(self): with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = ols(y=self.panel_y3, x=self.panel_x3, x_effects=['x1', 'x2'], dropped_dummies={'x2': 'foo'}) - - assert_almost_equal(result._y.values.flat, [1, 2, 3, 4]) - exp_x = [[0, 0, 0, 0, 1], [1, 0, 1, 0, 1], [0, 1, 0, 1, 1], - [0, 0, 0, 0, 1]] + # .flat is flatiter instance + assert_almost_equal(result._y.values.flat, [1, 2, 3, 4], + check_dtype=False) + exp_x = np.array([[0, 0, 0, 0, 1], [1, 0, 1, 0, 1], [0, 1, 0, 1, 1], + [0, 0, 0, 0, 1]], dtype=np.float64) assert_almost_equal(result._x.values, exp_x) exp_index = Index(['x1_B', 'x1_C', 'x2_bar', 'x2_baz', 
'intercept']) - self.assertTrue(exp_index.equals(result._x.columns)) + self.assert_index_equal(exp_index, result._x.columns) # _check_non_raw_results(result) @@ -914,16 +924,21 @@ def setUp(self): def testFilterWithSeriesRHS(self): (lhs, rhs, weights, rhs_pre, index, valid) = _filter_data(self.TS1, {'x1': self.TS2}, None) - self.tsAssertEqual(self.TS1, lhs) - self.tsAssertEqual(self.TS2[:3], rhs['x1']) - self.tsAssertEqual(self.TS2, rhs_pre['x1']) + self.tsAssertEqual(self.TS1.astype(np.float64), lhs, check_names=False) + self.tsAssertEqual(self.TS2[:3].astype(np.float64), rhs['x1'], + check_names=False) + self.tsAssertEqual(self.TS2.astype(np.float64), rhs_pre['x1'], + check_names=False) def testFilterWithSeriesRHS2(self): (lhs, rhs, weights, rhs_pre, index, valid) = _filter_data(self.TS2, {'x1': self.TS1}, None) - self.tsAssertEqual(self.TS2[:3], lhs) - self.tsAssertEqual(self.TS1, rhs['x1']) - self.tsAssertEqual(self.TS1, rhs_pre['x1']) + self.tsAssertEqual(self.TS2[:3].astype(np.float64), lhs, + check_names=False) + self.tsAssertEqual(self.TS1.astype(np.float64), rhs['x1'], + check_names=False) + self.tsAssertEqual(self.TS1.astype(np.float64), rhs_pre['x1'], + check_names=False) def testFilterWithSeriesRHS3(self): (lhs, rhs, weights, rhs_pre, @@ -931,32 +946,32 @@ def testFilterWithSeriesRHS3(self): exp_lhs = self.TS3[2:3] exp_rhs = self.TS4[2:3] exp_rhs_pre = self.TS4[1:] - self.tsAssertEqual(exp_lhs, lhs) - self.tsAssertEqual(exp_rhs, rhs['x1']) - self.tsAssertEqual(exp_rhs_pre, rhs_pre['x1']) + self.tsAssertEqual(exp_lhs, lhs, check_names=False) + self.tsAssertEqual(exp_rhs, rhs['x1'], check_names=False) + self.tsAssertEqual(exp_rhs_pre, rhs_pre['x1'], check_names=False) def testFilterWithDataFrameRHS(self): (lhs, rhs, weights, rhs_pre, index, valid) = _filter_data(self.TS1, self.DF1, None) - exp_lhs = self.TS1[1:] + exp_lhs = self.TS1[1:].astype(np.float64) exp_rhs1 = self.TS2[1:3] - exp_rhs2 = self.TS4[1:3] - self.tsAssertEqual(exp_lhs, lhs) - self.tsAssertEqual(exp_rhs1, rhs['x1']) - self.tsAssertEqual(exp_rhs2, rhs['x2']) + exp_rhs2 = self.TS4[1:3].astype(np.float64) + self.tsAssertEqual(exp_lhs, lhs, check_names=False) + self.tsAssertEqual(exp_rhs1, rhs['x1'], check_names=False) + self.tsAssertEqual(exp_rhs2, rhs['x2'], check_names=False) def testFilterWithDictRHS(self): (lhs, rhs, weights, rhs_pre, index, valid) = _filter_data(self.TS1, self.DICT1, None) - exp_lhs = self.TS1[1:] - exp_rhs1 = self.TS2[1:3] - exp_rhs2 = self.TS4[1:3] - self.tsAssertEqual(exp_lhs, lhs) - self.tsAssertEqual(exp_rhs1, rhs['x1']) - self.tsAssertEqual(exp_rhs2, rhs['x2']) - - def tsAssertEqual(self, ts1, ts2): - self.assert_numpy_array_equal(ts1, ts2) + exp_lhs = self.TS1[1:].astype(np.float64) + exp_rhs1 = self.TS2[1:3].astype(np.float64) + exp_rhs2 = self.TS4[1:3].astype(np.float64) + self.tsAssertEqual(exp_lhs, lhs, check_names=False) + self.tsAssertEqual(exp_rhs1, rhs['x1'], check_names=False) + self.tsAssertEqual(exp_rhs2, rhs['x2'], check_names=False) + + def tsAssertEqual(self, ts1, ts2, **kwargs): + self.assert_series_equal(ts1, ts2, **kwargs) if __name__ == '__main__': diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index 1da5487aefc01..3b50dd2c1d49f 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -330,28 +330,30 @@ def test_rename(self): # gets sorted alphabetical df = DataFrame(data) renamed = df.rename(index={'foo': 'bar', 'bar': 'foo'}) - self.assert_numpy_array_equal(renamed.index, ['foo', 
'bar']) + tm.assert_index_equal(renamed.index, pd.Index(['foo', 'bar'])) renamed = df.rename(index=str.upper) - self.assert_numpy_array_equal(renamed.index, ['BAR', 'FOO']) + tm.assert_index_equal(renamed.index, pd.Index(['BAR', 'FOO'])) # have to pass something self.assertRaises(TypeError, self.frame.rename) # partial columns renamed = self.frame.rename(columns={'C': 'foo', 'D': 'bar'}) - self.assert_numpy_array_equal( - renamed.columns, ['A', 'B', 'foo', 'bar']) + tm.assert_index_equal(renamed.columns, + pd.Index(['A', 'B', 'foo', 'bar'])) # other axis renamed = self.frame.T.rename(index={'C': 'foo', 'D': 'bar'}) - self.assert_numpy_array_equal(renamed.index, ['A', 'B', 'foo', 'bar']) + tm.assert_index_equal(renamed.index, + pd.Index(['A', 'B', 'foo', 'bar'])) # index with name index = Index(['foo', 'bar'], name='name') renamer = DataFrame(data, index=index) renamed = renamer.rename(index={'foo': 'bar', 'bar': 'foo'}) - self.assert_numpy_array_equal(renamed.index, ['bar', 'foo']) + tm.assert_index_equal(renamed.index, + pd.Index(['bar', 'foo'], name='name')) self.assertEqual(renamed.index.name, renamer.index.name) # MultiIndex @@ -363,12 +365,14 @@ def test_rename(self): renamer = DataFrame([(0, 0), (1, 1)], index=index, columns=columns) renamed = renamer.rename(index={'foo1': 'foo3', 'bar2': 'bar3'}, columns={'fizz1': 'fizz3', 'buzz2': 'buzz3'}) - new_index = MultiIndex.from_tuples( - [('foo3', 'bar1'), ('foo2', 'bar3')]) - new_columns = MultiIndex.from_tuples( - [('fizz3', 'buzz1'), ('fizz2', 'buzz3')]) - self.assert_numpy_array_equal(renamed.index, new_index) - self.assert_numpy_array_equal(renamed.columns, new_columns) + new_index = MultiIndex.from_tuples([('foo3', 'bar1'), + ('foo2', 'bar3')], + names=['foo', 'bar']) + new_columns = MultiIndex.from_tuples([('fizz3', 'buzz1'), + ('fizz2', 'buzz3')], + names=['fizz', 'buzz']) + self.assert_index_equal(renamed.index, new_index) + self.assert_index_equal(renamed.columns, new_columns) self.assertEqual(renamed.index.names, renamer.index.names) self.assertEqual(renamed.columns.names, renamer.columns.names) @@ -460,28 +464,30 @@ def test_reset_index(self): stacked.index.names = [None, None] deleveled2 = stacked.reset_index() - self.assert_numpy_array_equal(deleveled['first'], - deleveled2['level_0']) - self.assert_numpy_array_equal(deleveled['second'], - deleveled2['level_1']) + tm.assert_series_equal(deleveled['first'], deleveled2['level_0'], + check_names=False) + tm.assert_series_equal(deleveled['second'], deleveled2['level_1'], + check_names=False) # default name assigned rdf = self.frame.reset_index() - self.assert_numpy_array_equal(rdf['index'], self.frame.index.values) + exp = pd.Series(self.frame.index.values, name='index') + self.assert_series_equal(rdf['index'], exp) # default name assigned, corner case df = self.frame.copy() df['index'] = 'foo' rdf = df.reset_index() - self.assert_numpy_array_equal(rdf['level_0'], self.frame.index.values) + exp = pd.Series(self.frame.index.values, name='level_0') + self.assert_series_equal(rdf['level_0'], exp) # but this is ok self.frame.index.name = 'index' deleveled = self.frame.reset_index() - self.assert_numpy_array_equal(deleveled['index'], - self.frame.index.values) - self.assert_numpy_array_equal(deleveled.index, - np.arange(len(deleveled))) + self.assert_series_equal(deleveled['index'], + pd.Series(self.frame.index)) + self.assert_index_equal(deleveled.index, + pd.Index(np.arange(len(deleveled)))) # preserve column names self.frame.columns.name = 'columns' diff --git 
a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 20aaae586f14f..b71235a8f6576 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -18,12 +18,6 @@ import pandas.core.nanops as nanops import pandas.formats.printing as printing -from pandas.util.testing import (assert_almost_equal, - assert_equal, - assert_series_equal, - assert_frame_equal, - assertRaisesRegexp) - import pandas.util.testing as tm from pandas.tests.frame.common import TestData @@ -60,12 +54,12 @@ def _check_method(self, method='pearson', check_minp=False): if not check_minp: correls = self.frame.corr(method=method) exp = self.frame['A'].corr(self.frame['C'], method=method) - assert_almost_equal(correls['A']['C'], exp) + tm.assert_almost_equal(correls['A']['C'], exp) else: result = self.frame.corr(min_periods=len(self.frame) - 8) expected = self.frame.corr() expected.ix['A', 'B'] = expected.ix['B', 'A'] = nan - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_corr_non_numeric(self): tm._skip_if_no_scipy() @@ -75,7 +69,7 @@ def test_corr_non_numeric(self): # exclude non-numeric types result = self.mixed_frame.corr() expected = self.mixed_frame.ix[:, ['A', 'B', 'C', 'D']].corr() - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_corr_nooverlap(self): tm._skip_if_no_scipy() @@ -123,14 +117,14 @@ def test_corr_int_and_boolean(self): expected = DataFrame(np.ones((2, 2)), index=[ 'a', 'b'], columns=['a', 'b']) for meth in ['pearson', 'kendall', 'spearman']: - assert_frame_equal(df.corr(meth), expected) + tm.assert_frame_equal(df.corr(meth), expected) def test_cov(self): # min_periods no NAs (corner case) expected = self.frame.cov() result = self.frame.cov(min_periods=len(self.frame)) - assert_frame_equal(expected, result) + tm.assert_frame_equal(expected, result) result = self.frame.cov(min_periods=len(self.frame) + 1) self.assertTrue(isnull(result.values).all()) @@ -149,25 +143,25 @@ def test_cov(self): self.frame['B'][:10] = nan cov = self.frame.cov() - assert_almost_equal(cov['A']['C'], - self.frame['A'].cov(self.frame['C'])) + tm.assert_almost_equal(cov['A']['C'], + self.frame['A'].cov(self.frame['C'])) # exclude non-numeric types result = self.mixed_frame.cov() expected = self.mixed_frame.ix[:, ['A', 'B', 'C', 'D']].cov() - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # Single column frame df = DataFrame(np.linspace(0.0, 1.0, 10)) result = df.cov() expected = DataFrame(np.cov(df.values.T).reshape((1, 1)), index=df.columns, columns=df.columns) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) df.ix[0] = np.nan result = df.cov() expected = DataFrame(np.cov(df.values[1:].T).reshape((1, 1)), index=df.columns, columns=df.columns) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_corrwith(self): a = self.tsframe @@ -180,13 +174,13 @@ def test_corrwith(self): del b['B'] colcorr = a.corrwith(b, axis=0) - assert_almost_equal(colcorr['A'], a['A'].corr(b['A'])) + tm.assert_almost_equal(colcorr['A'], a['A'].corr(b['A'])) rowcorr = a.corrwith(b, axis=1) - assert_series_equal(rowcorr, a.T.corrwith(b.T, axis=0)) + tm.assert_series_equal(rowcorr, a.T.corrwith(b.T, axis=0)) dropped = a.corrwith(b, axis=0, drop=True) - assert_almost_equal(dropped['A'], a['A'].corr(b['A'])) + tm.assert_almost_equal(dropped['A'], a['A'].corr(b['A'])) self.assertNotIn('B', dropped) dropped = a.corrwith(b, 
axis=1, drop=True) @@ -199,7 +193,7 @@ def test_corrwith(self): df2 = DataFrame(randn(4, 4), index=index[:4], columns=columns) correls = df1.corrwith(df2, axis=1) for row in index[:4]: - assert_almost_equal(correls[row], df1.ix[row].corr(df2.ix[row])) + tm.assert_almost_equal(correls[row], df1.ix[row].corr(df2.ix[row])) def test_corrwith_with_objects(self): df1 = tm.makeTimeDataFrame() @@ -211,17 +205,17 @@ def test_corrwith_with_objects(self): result = df1.corrwith(df2) expected = df1.ix[:, cols].corrwith(df2.ix[:, cols]) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) result = df1.corrwith(df2, axis=1) expected = df1.ix[:, cols].corrwith(df2.ix[:, cols], axis=1) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) def test_corrwith_series(self): result = self.tsframe.corrwith(self.tsframe['A']) expected = self.tsframe.apply(self.tsframe['A'].corr) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) def test_corrwith_matches_corrcoef(self): df1 = DataFrame(np.arange(10000), columns=['a']) @@ -229,7 +223,7 @@ def test_corrwith_matches_corrcoef(self): c1 = df1.corrwith(df2)['a'] c2 = np.corrcoef(df1['a'], df2['a'])[0][1] - assert_almost_equal(c1, c2) + tm.assert_almost_equal(c1, c2) self.assertTrue(c1 < 1) def test_bool_describe_in_mixed_frame(self): @@ -246,14 +240,14 @@ def test_bool_describe_in_mixed_frame(self): 10, 20, 30, 40, 50]}, index=['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # Top value is a boolean value that is False result = df.describe(include=['bool']) expected = DataFrame({'bool_data': [5, 2, False, 3]}, index=['count', 'unique', 'top', 'freq']) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_describe_categorical_columns(self): # GH 11558 @@ -310,8 +304,9 @@ def test_reduce_mixed_frame(self): }) df.reindex(columns=['bool_data', 'int_data', 'string_data']) test = df.sum(axis=0) - assert_almost_equal(test.values, [2, 150, 'abcde']) - assert_series_equal(test, df.T.sum(axis=1)) + tm.assert_numpy_array_equal(test.values, + np.array([2, 150, 'abcde'], dtype=object)) + tm.assert_series_equal(test, df.T.sum(axis=1)) def test_count(self): f = lambda s: notnull(s).sum() @@ -333,17 +328,17 @@ def test_count(self): df = DataFrame(index=lrange(10)) result = df.count(1) expected = Series(0, index=df.index) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) df = DataFrame(columns=lrange(10)) result = df.count(0) expected = Series(0, index=df.columns) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) df = DataFrame() result = df.count() expected = Series(0, index=[]) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) def test_sum(self): self._check_stat_op('sum', np.sum, has_numeric_only=True) @@ -377,7 +372,7 @@ def test_stat_operators_attempt_obj_array(self): expected = getattr(df.astype('f8'), meth)(1) if not tm._incompat_bottleneck_version(meth): - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) def test_mean(self): self._check_stat_op('mean', np.mean, check_dates=True) @@ -405,12 +400,12 @@ def test_cummin(self): # axis = 0 cummin = self.tsframe.cummin() expected = self.tsframe.apply(Series.cummin) - assert_frame_equal(cummin, expected) + tm.assert_frame_equal(cummin, expected) # axis = 1 cummin = 
self.tsframe.cummin(axis=1) expected = self.tsframe.apply(Series.cummin, axis=1) - assert_frame_equal(cummin, expected) + tm.assert_frame_equal(cummin, expected) # it works df = DataFrame({'A': np.arange(20)}, index=np.arange(20)) @@ -428,12 +423,12 @@ def test_cummax(self): # axis = 0 cummax = self.tsframe.cummax() expected = self.tsframe.apply(Series.cummax) - assert_frame_equal(cummax, expected) + tm.assert_frame_equal(cummax, expected) # axis = 1 cummax = self.tsframe.cummax(axis=1) expected = self.tsframe.apply(Series.cummax, axis=1) - assert_frame_equal(cummax, expected) + tm.assert_frame_equal(cummax, expected) # it works df = DataFrame({'A': np.arange(20)}, index=np.arange(20)) @@ -460,11 +455,11 @@ def test_var_std(self): result = self.tsframe.std(ddof=4) expected = self.tsframe.apply(lambda x: x.std(ddof=4)) - assert_almost_equal(result, expected) + tm.assert_almost_equal(result, expected) result = self.tsframe.var(ddof=4) expected = self.tsframe.apply(lambda x: x.var(ddof=4)) - assert_almost_equal(result, expected) + tm.assert_almost_equal(result, expected) arr = np.repeat(np.random.random((1, 1000)), 1000, 0) result = nanops.nanvar(arr, axis=0) @@ -489,11 +484,11 @@ def test_numeric_only_flag(self): for meth in methods: result = getattr(df1, meth)(axis=1, numeric_only=True) expected = getattr(df1[['bar', 'baz']], meth)(axis=1) - assert_series_equal(expected, result) + tm.assert_series_equal(expected, result) result = getattr(df2, meth)(axis=1, numeric_only=True) expected = getattr(df2[['bar', 'baz']], meth)(axis=1) - assert_series_equal(expected, result) + tm.assert_series_equal(expected, result) # df1 has all numbers, df2 has a letter inside self.assertRaises(TypeError, lambda: getattr(df1, meth) @@ -509,12 +504,12 @@ def test_cumsum(self): # axis = 0 cumsum = self.tsframe.cumsum() expected = self.tsframe.apply(Series.cumsum) - assert_frame_equal(cumsum, expected) + tm.assert_frame_equal(cumsum, expected) # axis = 1 cumsum = self.tsframe.cumsum(axis=1) expected = self.tsframe.apply(Series.cumsum, axis=1) - assert_frame_equal(cumsum, expected) + tm.assert_frame_equal(cumsum, expected) # works df = DataFrame({'A': np.arange(20)}, index=np.arange(20)) @@ -532,12 +527,12 @@ def test_cumprod(self): # axis = 0 cumprod = self.tsframe.cumprod() expected = self.tsframe.apply(Series.cumprod) - assert_frame_equal(cumprod, expected) + tm.assert_frame_equal(cumprod, expected) # axis = 1 cumprod = self.tsframe.cumprod(axis=1) expected = self.tsframe.apply(Series.cumprod, axis=1) - assert_frame_equal(cumprod, expected) + tm.assert_frame_equal(cumprod, expected) # fix issue cumprod_xs = self.tsframe.cumprod(axis=1) @@ -574,48 +569,48 @@ def test_rank(self): exp1 = np.apply_along_axis(rankdata, 1, fvals) exp1[mask] = np.nan - assert_almost_equal(ranks0.values, exp0) - assert_almost_equal(ranks1.values, exp1) + tm.assert_almost_equal(ranks0.values, exp0) + tm.assert_almost_equal(ranks1.values, exp1) # integers df = DataFrame(np.random.randint(0, 5, size=40).reshape((10, 4))) result = df.rank() exp = df.astype(float).rank() - assert_frame_equal(result, exp) + tm.assert_frame_equal(result, exp) result = df.rank(1) exp = df.astype(float).rank(1) - assert_frame_equal(result, exp) + tm.assert_frame_equal(result, exp) def test_rank2(self): df = DataFrame([[1, 3, 2], [1, 2, 3]]) expected = DataFrame([[1.0, 3.0, 2.0], [1, 2, 3]]) / 3.0 result = df.rank(1, pct=True) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) df = DataFrame([[1, 3, 2], [1, 2, 3]]) expected = 
df.rank(0) / 2.0 result = df.rank(0, pct=True) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) df = DataFrame([['b', 'c', 'a'], ['a', 'c', 'b']]) expected = DataFrame([[2.0, 3.0, 1.0], [1, 3, 2]]) result = df.rank(1, numeric_only=False) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) expected = DataFrame([[2.0, 1.5, 1.0], [1, 1.5, 2]]) result = df.rank(0, numeric_only=False) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) df = DataFrame([['b', np.nan, 'a'], ['a', 'c', 'b']]) expected = DataFrame([[2.0, nan, 1.0], [1.0, 3.0, 2.0]]) result = df.rank(1, numeric_only=False) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) expected = DataFrame([[2.0, nan, 1.0], [1.0, 1.0, 2.0]]) result = df.rank(0, numeric_only=False) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # f7u12, this does not work without extensive workaround data = [[datetime(2001, 1, 5), nan, datetime(2001, 1, 2)], @@ -627,12 +622,12 @@ def test_rank2(self): expected = DataFrame([[2., nan, 1.], [2., 3., 1.]]) result = df.rank(1, numeric_only=False, ascending=True) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) expected = DataFrame([[1., nan, 2.], [2., 1., 3.]]) result = df.rank(1, numeric_only=False, ascending=False) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # mixed-type frames self.mixed_frame['datetime'] = datetime.now() @@ -640,12 +635,12 @@ def test_rank2(self): result = self.mixed_frame.rank(1) expected = self.mixed_frame.rank(1, numeric_only=True) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) df = DataFrame({"a": [1e-20, -5, 1e-20 + 1e-40, 10, 1e60, 1e80, 1e-30]}) exp = DataFrame({"a": [3.5, 1., 3.5, 5., 6., 7., 2.]}) - assert_frame_equal(df.rank(), exp) + tm.assert_frame_equal(df.rank(), exp) def test_rank_na_option(self): tm._skip_if_no_scipy() @@ -665,8 +660,8 @@ def test_rank_na_option(self): exp0 = np.apply_along_axis(rankdata, 0, fvals) exp1 = np.apply_along_axis(rankdata, 1, fvals) - assert_almost_equal(ranks0.values, exp0) - assert_almost_equal(ranks1.values, exp1) + tm.assert_almost_equal(ranks0.values, exp0) + tm.assert_almost_equal(ranks1.values, exp1) # top ranks0 = self.frame.rank(na_option='top') @@ -680,8 +675,8 @@ def test_rank_na_option(self): exp0 = np.apply_along_axis(rankdata, 0, fval0) exp1 = np.apply_along_axis(rankdata, 1, fval1) - assert_almost_equal(ranks0.values, exp0) - assert_almost_equal(ranks1.values, exp1) + tm.assert_almost_equal(ranks0.values, exp0) + tm.assert_almost_equal(ranks1.values, exp1) # descending @@ -694,8 +689,8 @@ def test_rank_na_option(self): exp0 = np.apply_along_axis(rankdata, 0, -fvals) exp1 = np.apply_along_axis(rankdata, 1, -fvals) - assert_almost_equal(ranks0.values, exp0) - assert_almost_equal(ranks1.values, exp1) + tm.assert_almost_equal(ranks0.values, exp0) + tm.assert_almost_equal(ranks1.values, exp1) # descending @@ -711,14 +706,14 @@ def test_rank_na_option(self): exp0 = np.apply_along_axis(rankdata, 0, -fval0) exp1 = np.apply_along_axis(rankdata, 1, -fval1) - assert_almost_equal(ranks0.values, exp0) - assert_almost_equal(ranks1.values, exp1) + tm.assert_numpy_array_equal(ranks0.values, exp0) + tm.assert_numpy_array_equal(ranks1.values, exp1) def test_rank_axis(self): # check if using axes' names gives the same result df = pd.DataFrame([[2, 1], [4, 3]]) - assert_frame_equal(df.rank(axis=0), 
df.rank(axis='index')) - assert_frame_equal(df.rank(axis=1), df.rank(axis='columns')) + tm.assert_frame_equal(df.rank(axis=0), df.rank(axis='index')) + tm.assert_frame_equal(df.rank(axis=1), df.rank(axis='columns')) def test_sem(self): alt = lambda x: np.std(x, ddof=1) / np.sqrt(len(x)) @@ -727,7 +722,7 @@ def test_sem(self): result = self.tsframe.sem(ddof=4) expected = self.tsframe.apply( lambda x: x.std(ddof=4) / np.sqrt(len(x))) - assert_almost_equal(result, expected) + tm.assert_almost_equal(result, expected) arr = np.repeat(np.random.random((1, 1000)), 1000, 0) result = nanops.nansem(arr, axis=0) @@ -789,7 +784,7 @@ def alt(x): kurt = df.kurt() kurt2 = df.kurt(level=0).xs('bar') - assert_series_equal(kurt, kurt2, check_names=False) + tm.assert_series_equal(kurt, kurt2, check_names=False) self.assertTrue(kurt.name is None) self.assertEqual(kurt2.name, 'bar') @@ -827,26 +822,26 @@ def wrapper(x): result0 = f(axis=0, skipna=False) result1 = f(axis=1, skipna=False) - assert_series_equal(result0, frame.apply(wrapper), - check_dtype=check_dtype, - check_less_precise=check_less_precise) + tm.assert_series_equal(result0, frame.apply(wrapper), + check_dtype=check_dtype, + check_less_precise=check_less_precise) # HACK: win32 - assert_series_equal(result1, frame.apply(wrapper, axis=1), - check_dtype=False, - check_less_precise=check_less_precise) + tm.assert_series_equal(result1, frame.apply(wrapper, axis=1), + check_dtype=False, + check_less_precise=check_less_precise) else: skipna_wrapper = alternative wrapper = alternative result0 = f(axis=0) result1 = f(axis=1) - assert_series_equal(result0, frame.apply(skipna_wrapper), - check_dtype=check_dtype, - check_less_precise=check_less_precise) + tm.assert_series_equal(result0, frame.apply(skipna_wrapper), + check_dtype=check_dtype, + check_less_precise=check_less_precise) if not tm._incompat_bottleneck_version(name): - assert_series_equal(result1, frame.apply(skipna_wrapper, axis=1), - check_dtype=False, - check_less_precise=check_less_precise) + exp = frame.apply(skipna_wrapper, axis=1) + tm.assert_series_equal(result1, exp, check_dtype=False, + check_less_precise=check_less_precise) # check dtypes if check_dtype: @@ -859,7 +854,7 @@ def wrapper(x): # assert_series_equal(result, comp) # bad axis - assertRaisesRegexp(ValueError, 'No axis named 2', f, axis=2) + tm.assertRaisesRegexp(ValueError, 'No axis named 2', f, axis=2) # make sure works on mixed-type frame getattr(self.mixed_frame, name)(axis=0) getattr(self.mixed_frame, name)(axis=1) @@ -885,20 +880,20 @@ def test_mode(self): "C": [8, 8, 8, 9, 9, 9], "D": np.arange(6, dtype='int64'), "E": [8, 8, 1, 1, 3, 3]}) - assert_frame_equal(df[["A"]].mode(), - pd.DataFrame({"A": [12]})) + tm.assert_frame_equal(df[["A"]].mode(), + pd.DataFrame({"A": [12]})) expected = pd.Series([], dtype='int64', name='D').to_frame() - assert_frame_equal(df[["D"]].mode(), expected) + tm.assert_frame_equal(df[["D"]].mode(), expected) expected = pd.Series([1, 3, 8], dtype='int64', name='E').to_frame() - assert_frame_equal(df[["E"]].mode(), expected) - assert_frame_equal(df[["A", "B"]].mode(), - pd.DataFrame({"A": [12], "B": [10.]})) - assert_frame_equal(df.mode(), - pd.DataFrame({"A": [12, np.nan, np.nan], - "B": [10, np.nan, np.nan], - "C": [8, 9, np.nan], - "D": [np.nan, np.nan, np.nan], - "E": [1, 3, 8]})) + tm.assert_frame_equal(df[["E"]].mode(), expected) + tm.assert_frame_equal(df[["A", "B"]].mode(), + pd.DataFrame({"A": [12], "B": [10.]})) + tm.assert_frame_equal(df.mode(), + pd.DataFrame({"A": [12, np.nan, np.nan], 
+ "B": [10, np.nan, np.nan], + "C": [8, 9, np.nan], + "D": [np.nan, np.nan, np.nan], + "E": [1, 3, 8]})) # outputs in sorted order df["C"] = list(reversed(df["C"])) @@ -910,7 +905,7 @@ def test_mode(self): "C": [8, 9]})) printing.pprint_thing(a) printing.pprint_thing(b) - assert_frame_equal(a, b) + tm.assert_frame_equal(a, b) # should work with heterogeneous types df = pd.DataFrame({"A": np.arange(6, dtype='int64'), "B": pd.date_range('2011', periods=6), @@ -918,7 +913,7 @@ def test_mode(self): exp = pd.DataFrame({"A": pd.Series([], dtype=df["A"].dtype), "B": pd.Series([], dtype=df["B"].dtype), "C": pd.Series([], dtype=df["C"].dtype)}) - assert_frame_equal(df.mode(), exp) + tm.assert_frame_equal(df.mode(), exp) # and also when not empty df.loc[1, "A"] = 0 @@ -929,7 +924,7 @@ def test_mode(self): dtype=df["B"].dtype), "C": pd.Series(['e'], dtype=df["C"].dtype)}) - assert_frame_equal(df.mode(), exp) + tm.assert_frame_equal(df.mode(), exp) def test_operators_timedelta64(self): from datetime import timedelta @@ -962,8 +957,8 @@ def test_operators_timedelta64(self): result2 = abs(diffs) expected = DataFrame(dict(A=df['A'] - df['C'], B=df['B'] - df['A'])) - assert_frame_equal(result, expected) - assert_frame_equal(result2, expected) + tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result2, expected) # mixed frame mixed = diffs.copy() @@ -982,22 +977,22 @@ def test_operators_timedelta64(self): 'foo', 1, 1.0, Timestamp('20130101')], index=mixed.columns) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) # excludes numeric result = mixed.min(axis=1) expected = Series([1, 1, 1.], index=[0, 1, 2]) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) # works when only those columns are selected result = mixed[['A', 'B']].min(1) expected = Series([timedelta(days=-1)] * 3) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) result = mixed[['A', 'B']].min() expected = Series([timedelta(seconds=5 * 60 + 5), timedelta(days=-1)], index=['A', 'B']) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) # GH 3106 df = DataFrame({'time': date_range('20130102', periods=5), @@ -1035,13 +1030,13 @@ def test_mean_corner(self): # unit test when have object data the_mean = self.mixed_frame.mean(axis=0) the_sum = self.mixed_frame.sum(axis=0, numeric_only=True) - self.assertTrue(the_sum.index.equals(the_mean.index)) + self.assert_index_equal(the_sum.index, the_mean.index) self.assertTrue(len(the_mean.index) < len(self.mixed_frame.columns)) # xs sum mixed type, just want to know it works... 
the_mean = self.mixed_frame.mean(axis=1) the_sum = self.mixed_frame.sum(axis=1, numeric_only=True) - self.assertTrue(the_sum.index.equals(the_mean.index)) + self.assert_index_equal(the_sum.index, the_mean.index) # take mean of boolean column self.frame['bool'] = self.frame['A'] > 0 @@ -1070,8 +1065,8 @@ def test_count_objects(self): dm = DataFrame(self.mixed_frame._series) df = DataFrame(self.mixed_frame._series) - assert_series_equal(dm.count(), df.count()) - assert_series_equal(dm.count(1), df.count(1)) + tm.assert_series_equal(dm.count(), df.count()) + tm.assert_series_equal(dm.count(1), df.count(1)) def test_cumsum_corner(self): dm = DataFrame(np.arange(20).reshape(4, 5), @@ -1094,9 +1089,9 @@ def test_idxmin(self): for axis in [0, 1]: for df in [frame, self.intframe]: result = df.idxmin(axis=axis, skipna=skipna) - expected = df.apply( - Series.idxmin, axis=axis, skipna=skipna) - assert_series_equal(result, expected) + expected = df.apply(Series.idxmin, axis=axis, + skipna=skipna) + tm.assert_series_equal(result, expected) self.assertRaises(ValueError, frame.idxmin, axis=2) @@ -1108,9 +1103,9 @@ def test_idxmax(self): for axis in [0, 1]: for df in [frame, self.intframe]: result = df.idxmax(axis=axis, skipna=skipna) - expected = df.apply( - Series.idxmax, axis=axis, skipna=skipna) - assert_series_equal(result, expected) + expected = df.apply(Series.idxmax, axis=axis, + skipna=skipna) + tm.assert_series_equal(result, expected) self.assertRaises(ValueError, frame.idxmax, axis=2) @@ -1169,18 +1164,18 @@ def wrapper(x): result0 = f(axis=0, skipna=False) result1 = f(axis=1, skipna=False) - assert_series_equal(result0, frame.apply(wrapper)) - assert_series_equal(result1, frame.apply(wrapper, axis=1), - check_dtype=False) # HACK: win32 + tm.assert_series_equal(result0, frame.apply(wrapper)) + tm.assert_series_equal(result1, frame.apply(wrapper, axis=1), + check_dtype=False) # HACK: win32 else: skipna_wrapper = alternative wrapper = alternative result0 = f(axis=0) result1 = f(axis=1) - assert_series_equal(result0, frame.apply(skipna_wrapper)) - assert_series_equal(result1, frame.apply(skipna_wrapper, axis=1), - check_dtype=False) + tm.assert_series_equal(result0, frame.apply(skipna_wrapper)) + tm.assert_series_equal(result1, frame.apply(skipna_wrapper, axis=1), + check_dtype=False) # result = f(axis=1) # comp = frame.apply(alternative, axis=1).reindex(result.index) @@ -1230,7 +1225,7 @@ def test_nlargest(self): 'b': list(ascii_lowercase[:10])}) result = df.nlargest(5, 'a') expected = df.sort_values('a', ascending=False).head(5) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_nlargest_multiple_columns(self): from string import ascii_lowercase @@ -1239,7 +1234,7 @@ def test_nlargest_multiple_columns(self): 'c': np.random.permutation(10).astype('float64')}) result = df.nlargest(5, ['a', 'b']) expected = df.sort_values(['a', 'b'], ascending=False).head(5) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_nsmallest(self): from string import ascii_lowercase @@ -1247,7 +1242,7 @@ def test_nsmallest(self): 'b': list(ascii_lowercase[:10])}) result = df.nsmallest(5, 'a') expected = df.sort_values('a').head(5) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_nsmallest_multiple_columns(self): from string import ascii_lowercase @@ -1256,7 +1251,7 @@ def test_nsmallest_multiple_columns(self): 'c': np.random.permutation(10).astype('float64')}) result = df.nsmallest(5, ['a', 'c']) expected 
= df.sort_values(['a', 'c']).head(5) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # ---------------------------------------------------------------------- # Isin @@ -1270,13 +1265,13 @@ def test_isin(self): result = df.isin(other) expected = DataFrame([df.loc[s].isin(other) for s in df.index]) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_isin_empty(self): df = DataFrame({'A': ['a', 'b', 'c'], 'B': ['a', 'e', 'f']}) result = df.isin([]) expected = pd.DataFrame(False, df.index, df.columns) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_isin_dict(self): df = DataFrame({'A': ['a', 'b', 'c'], 'B': ['a', 'e', 'f']}) @@ -1286,7 +1281,7 @@ def test_isin_dict(self): expected.loc[0, 'A'] = True result = df.isin(d) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # non unique columns df = DataFrame({'A': ['a', 'b', 'c'], 'B': ['a', 'e', 'f']}) @@ -1294,7 +1289,7 @@ def test_isin_dict(self): expected = DataFrame(False, df.index, df.columns) expected.loc[0, 'A'] = True result = df.isin(d) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_isin_with_string_scalar(self): # GH4763 @@ -1314,13 +1309,13 @@ def test_isin_df(self): result = df1.isin(df2) expected['A'].loc[[1, 3]] = True expected['B'].loc[[0, 2]] = True - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # partial overlapping columns df2.columns = ['A', 'C'] result = df1.isin(df2) expected['B'] = False - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_isin_df_dupe_values(self): df1 = DataFrame({'A': [1, 2, 3, 4], 'B': [2, np.nan, 4, 4]}) @@ -1348,7 +1343,7 @@ def test_isin_dupe_self(self): expected = DataFrame(False, index=df.index, columns=df.columns) expected.loc[0] = True expected.iloc[1, 1] = True - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_isin_against_series(self): df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [2, np.nan, 4, 4]}, @@ -1358,7 +1353,7 @@ def test_isin_against_series(self): expected['A'].loc['a'] = True expected.loc['d'] = True result = df.isin(s) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_isin_multiIndex(self): idx = MultiIndex.from_tuples([(0, 'a', 'foo'), (0, 'a', 'bar'), @@ -1374,7 +1369,7 @@ def test_isin_multiIndex(self): # against regular index expected = DataFrame(False, index=df1.index, columns=df1.columns) result = df1.isin(df2) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) df2.index = idx expected = df2.values.astype(np.bool) @@ -1382,7 +1377,7 @@ def test_isin_multiIndex(self): expected = DataFrame(expected, columns=['A', 'B'], index=idx) result = df1.isin(df2) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # ---------------------------------------------------------------------- # Row deduplication @@ -1398,43 +1393,43 @@ def test_drop_duplicates(self): # single column result = df.drop_duplicates('AAA') expected = df[:2] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = df.drop_duplicates('AAA', keep='last') expected = df.ix[[6, 7]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = df.drop_duplicates('AAA', keep=False) expected = df.ix[[]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) 
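# [a minimal, self-contained sketch of the keep='first'/'last'/False semantics
#  the drop_duplicates hunks below assert; this three-row frame is illustrative
#  and is not the fixture built by test_drop_duplicates]
import pandas as pd

df = pd.DataFrame({'AAA': ['foo', 'foo', 'bar'], 'B': [1, 1, 2]})

assert list(df.drop_duplicates('AAA').index) == [0, 2]               # default keep='first'
assert list(df.drop_duplicates('AAA', keep='last').index) == [1, 2]  # last occurrence wins
assert list(df.drop_duplicates('AAA', keep=False).index) == [2]      # every duplicated row dropped

# keep='last' is the replacement for the take_last=True spelling that these
# tests now exercise only under assert_produces_warning(FutureWarning).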
self.assertEqual(len(result), 0) # deprecate take_last with tm.assert_produces_warning(FutureWarning): result = df.drop_duplicates('AAA', take_last=True) expected = df.ix[[6, 7]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # multi column expected = df.ix[[0, 1, 2, 3]] result = df.drop_duplicates(np.array(['AAA', 'B'])) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = df.drop_duplicates(['AAA', 'B']) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = df.drop_duplicates(('AAA', 'B'), keep='last') expected = df.ix[[0, 5, 6, 7]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = df.drop_duplicates(('AAA', 'B'), keep=False) expected = df.ix[[0]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # deprecate take_last with tm.assert_produces_warning(FutureWarning): result = df.drop_duplicates(('AAA', 'B'), take_last=True) expected = df.ix[[0, 5, 6, 7]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # consider everything df2 = df.ix[:, ['AAA', 'B', 'C']] @@ -1442,64 +1437,64 @@ def test_drop_duplicates(self): result = df2.drop_duplicates() # in this case only expected = df2.drop_duplicates(['AAA', 'B']) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = df2.drop_duplicates(keep='last') expected = df2.drop_duplicates(['AAA', 'B'], keep='last') - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = df2.drop_duplicates(keep=False) expected = df2.drop_duplicates(['AAA', 'B'], keep=False) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # deprecate take_last with tm.assert_produces_warning(FutureWarning): result = df2.drop_duplicates(take_last=True) with tm.assert_produces_warning(FutureWarning): expected = df2.drop_duplicates(['AAA', 'B'], take_last=True) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # integers result = df.drop_duplicates('C') expected = df.iloc[[0, 2]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = df.drop_duplicates('C', keep='last') expected = df.iloc[[-2, -1]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) df['E'] = df['C'].astype('int8') result = df.drop_duplicates('E') expected = df.iloc[[0, 2]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = df.drop_duplicates('E', keep='last') expected = df.iloc[[-2, -1]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # GH 11376 df = pd.DataFrame({'x': [7, 6, 3, 3, 4, 8, 0], 'y': [0, 6, 5, 5, 9, 1, 2]}) expected = df.loc[df.index != 3] - assert_frame_equal(df.drop_duplicates(), expected) + tm.assert_frame_equal(df.drop_duplicates(), expected) df = pd.DataFrame([[1, 0], [0, 2]]) - assert_frame_equal(df.drop_duplicates(), df) + tm.assert_frame_equal(df.drop_duplicates(), df) df = pd.DataFrame([[-2, 0], [0, -4]]) - assert_frame_equal(df.drop_duplicates(), df) + tm.assert_frame_equal(df.drop_duplicates(), df) x = np.iinfo(np.int64).max / 3 * 2 df = pd.DataFrame([[-x, x], [0, x + 4]]) - assert_frame_equal(df.drop_duplicates(), df) + tm.assert_frame_equal(df.drop_duplicates(), df) df = pd.DataFrame([[-x, x], [x, x + 4]]) - assert_frame_equal(df.drop_duplicates(), df) + tm.assert_frame_equal(df.drop_duplicates(), df) # GH 11864 df = 
pd.DataFrame([i] * 9 for i in range(16)) df = df.append([[1] + [0] * 8], ignore_index=True) for keep in ['first', 'last', False]: - assert_equal(df.duplicated(keep=keep).sum(), 0) + self.assertEqual(df.duplicated(keep=keep).sum(), 0) def test_drop_duplicates_for_take_all(self): df = DataFrame({'AAA': ['foo', 'bar', 'baz', 'bar', @@ -1512,28 +1507,28 @@ def test_drop_duplicates_for_take_all(self): # single column result = df.drop_duplicates('AAA') expected = df.iloc[[0, 1, 2, 6]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = df.drop_duplicates('AAA', keep='last') expected = df.iloc[[2, 5, 6, 7]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = df.drop_duplicates('AAA', keep=False) expected = df.iloc[[2, 6]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # multiple columns result = df.drop_duplicates(['AAA', 'B']) expected = df.iloc[[0, 1, 2, 3, 4, 6]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = df.drop_duplicates(['AAA', 'B'], keep='last') expected = df.iloc[[0, 1, 2, 5, 6, 7]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = df.drop_duplicates(['AAA', 'B'], keep=False) expected = df.iloc[[0, 1, 2, 6]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_drop_duplicates_tuple(self): df = DataFrame({('AA', 'AB'): ['foo', 'bar', 'foo', 'bar', @@ -1546,27 +1541,27 @@ def test_drop_duplicates_tuple(self): # single column result = df.drop_duplicates(('AA', 'AB')) expected = df[:2] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = df.drop_duplicates(('AA', 'AB'), keep='last') expected = df.ix[[6, 7]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = df.drop_duplicates(('AA', 'AB'), keep=False) expected = df.ix[[]] # empty df self.assertEqual(len(result), 0) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # deprecate take_last with tm.assert_produces_warning(FutureWarning): result = df.drop_duplicates(('AA', 'AB'), take_last=True) expected = df.ix[[6, 7]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # multi column expected = df.ix[[0, 1, 2, 3]] result = df.drop_duplicates((('AA', 'AB'), 'B')) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_drop_duplicates_NA(self): # none @@ -1580,41 +1575,41 @@ def test_drop_duplicates_NA(self): # single column result = df.drop_duplicates('A') expected = df.ix[[0, 2, 3]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = df.drop_duplicates('A', keep='last') expected = df.ix[[1, 6, 7]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = df.drop_duplicates('A', keep=False) expected = df.ix[[]] # empty df - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) self.assertEqual(len(result), 0) # deprecate take_last with tm.assert_produces_warning(FutureWarning): result = df.drop_duplicates('A', take_last=True) expected = df.ix[[1, 6, 7]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # multi column result = df.drop_duplicates(['A', 'B']) expected = df.ix[[0, 2, 3, 6]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = df.drop_duplicates(['A', 'B'], keep='last') expected 
= df.ix[[1, 5, 6, 7]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = df.drop_duplicates(['A', 'B'], keep=False) expected = df.ix[[6]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # deprecate take_last with tm.assert_produces_warning(FutureWarning): result = df.drop_duplicates(['A', 'B'], take_last=True) expected = df.ix[[1, 5, 6, 7]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # nan df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', @@ -1627,41 +1622,41 @@ def test_drop_duplicates_NA(self): # single column result = df.drop_duplicates('C') expected = df[:2] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = df.drop_duplicates('C', keep='last') expected = df.ix[[3, 7]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = df.drop_duplicates('C', keep=False) expected = df.ix[[]] # empty df - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) self.assertEqual(len(result), 0) # deprecate take_last with tm.assert_produces_warning(FutureWarning): result = df.drop_duplicates('C', take_last=True) expected = df.ix[[3, 7]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # multi column result = df.drop_duplicates(['C', 'B']) expected = df.ix[[0, 1, 2, 4]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = df.drop_duplicates(['C', 'B'], keep='last') expected = df.ix[[1, 3, 6, 7]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = df.drop_duplicates(['C', 'B'], keep=False) expected = df.ix[[1]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # deprecate take_last with tm.assert_produces_warning(FutureWarning): result = df.drop_duplicates(['C', 'B'], take_last=True) expected = df.ix[[1, 3, 6, 7]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_drop_duplicates_NA_for_take_all(self): # none @@ -1672,30 +1667,30 @@ def test_drop_duplicates_NA_for_take_all(self): # single column result = df.drop_duplicates('A') expected = df.iloc[[0, 2, 3, 5, 7]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = df.drop_duplicates('A', keep='last') expected = df.iloc[[1, 4, 5, 6, 7]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = df.drop_duplicates('A', keep=False) expected = df.iloc[[5, 7]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # nan # single column result = df.drop_duplicates('C') expected = df.iloc[[0, 1, 5, 6]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = df.drop_duplicates('C', keep='last') expected = df.iloc[[3, 5, 6, 7]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = df.drop_duplicates('C', keep=False) expected = df.iloc[[5, 6]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_drop_duplicates_inplace(self): orig = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', @@ -1710,19 +1705,19 @@ def test_drop_duplicates_inplace(self): df.drop_duplicates('A', inplace=True) expected = orig[:2] result = df - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) df = orig.copy() df.drop_duplicates('A', keep='last', inplace=True) expected = orig.ix[[6, 7]] result = 
df - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) df = orig.copy() df.drop_duplicates('A', keep=False, inplace=True) expected = orig.ix[[]] result = df - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) self.assertEqual(len(df), 0) # deprecate take_last @@ -1731,26 +1726,26 @@ def test_drop_duplicates_inplace(self): df.drop_duplicates('A', take_last=True, inplace=True) expected = orig.ix[[6, 7]] result = df - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # multi column df = orig.copy() df.drop_duplicates(['A', 'B'], inplace=True) expected = orig.ix[[0, 1, 2, 3]] result = df - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) df = orig.copy() df.drop_duplicates(['A', 'B'], keep='last', inplace=True) expected = orig.ix[[0, 5, 6, 7]] result = df - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) df = orig.copy() df.drop_duplicates(['A', 'B'], keep=False, inplace=True) expected = orig.ix[[0]] result = df - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # deprecate take_last df = orig.copy() @@ -1758,7 +1753,7 @@ def test_drop_duplicates_inplace(self): df.drop_duplicates(['A', 'B'], take_last=True, inplace=True) expected = orig.ix[[0, 5, 6, 7]] result = df - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # consider everything orig2 = orig.ix[:, ['A', 'B', 'C']].copy() @@ -1768,19 +1763,19 @@ def test_drop_duplicates_inplace(self): # in this case only expected = orig2.drop_duplicates(['A', 'B']) result = df2 - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) df2 = orig2.copy() df2.drop_duplicates(keep='last', inplace=True) expected = orig2.drop_duplicates(['A', 'B'], keep='last') result = df2 - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) df2 = orig2.copy() df2.drop_duplicates(keep=False, inplace=True) expected = orig2.drop_duplicates(['A', 'B'], keep=False) result = df2 - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # deprecate take_last df2 = orig2.copy() @@ -1789,7 +1784,7 @@ def test_drop_duplicates_inplace(self): with tm.assert_produces_warning(FutureWarning): expected = orig2.drop_duplicates(['A', 'B'], take_last=True) result = df2 - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # Rounding @@ -1798,26 +1793,26 @@ def test_round(self): # Test that rounding an empty DataFrame does nothing df = DataFrame() - assert_frame_equal(df, df.round()) + tm.assert_frame_equal(df, df.round()) # Here's the test frame we'll be working with - df = DataFrame( - {'col1': [1.123, 2.123, 3.123], 'col2': [1.234, 2.234, 3.234]}) + df = DataFrame({'col1': [1.123, 2.123, 3.123], + 'col2': [1.234, 2.234, 3.234]}) # Default round to integer (i.e. 
decimals=0) expected_rounded = DataFrame( {'col1': [1., 2., 3.], 'col2': [1., 2., 3.]}) - assert_frame_equal(df.round(), expected_rounded) + tm.assert_frame_equal(df.round(), expected_rounded) # Round with an integer decimals = 2 - expected_rounded = DataFrame( - {'col1': [1.12, 2.12, 3.12], 'col2': [1.23, 2.23, 3.23]}) - assert_frame_equal(df.round(decimals), expected_rounded) + expected_rounded = DataFrame({'col1': [1.12, 2.12, 3.12], + 'col2': [1.23, 2.23, 3.23]}) + tm.assert_frame_equal(df.round(decimals), expected_rounded) # This should also work with np.round (since np.round dispatches to # df.round) - assert_frame_equal(np.round(df, decimals), expected_rounded) + tm.assert_frame_equal(np.round(df, decimals), expected_rounded) # Round with a list round_list = [1, 2] @@ -1828,19 +1823,19 @@ def test_round(self): expected_rounded = DataFrame( {'col1': [1.1, 2.1, 3.1], 'col2': [1.23, 2.23, 3.23]}) round_dict = {'col1': 1, 'col2': 2} - assert_frame_equal(df.round(round_dict), expected_rounded) + tm.assert_frame_equal(df.round(round_dict), expected_rounded) # Incomplete dict expected_partially_rounded = DataFrame( {'col1': [1.123, 2.123, 3.123], 'col2': [1.2, 2.2, 3.2]}) partial_round_dict = {'col2': 1} - assert_frame_equal( - df.round(partial_round_dict), expected_partially_rounded) + tm.assert_frame_equal(df.round(partial_round_dict), + expected_partially_rounded) # Dict with unknown elements wrong_round_dict = {'col3': 2, 'col2': 1} - assert_frame_equal( - df.round(wrong_round_dict), expected_partially_rounded) + tm.assert_frame_equal(df.round(wrong_round_dict), + expected_partially_rounded) # float input to `decimals` non_int_round_dict = {'col1': 1, 'col2': 0.5} @@ -1879,8 +1874,8 @@ def test_round(self): big_df = df * 100 expected_neg_rounded = DataFrame( {'col1': [110., 210, 310], 'col2': [100., 200, 300]}) - assert_frame_equal( - big_df.round(negative_round_dict), expected_neg_rounded) + tm.assert_frame_equal(big_df.round(negative_round_dict), + expected_neg_rounded) # nan in Series round nan_round_Series = Series({'col1': nan, 'col2': 1}) @@ -1899,7 +1894,7 @@ def test_round(self): df.round(nan_round_Series) # Make sure this doesn't break existing Series.round - assert_series_equal(df['col1'].round(1), expected_rounded['col1']) + tm.assert_series_equal(df['col1'].round(1), expected_rounded['col1']) # named columns # GH 11986 @@ -1908,20 +1903,20 @@ def test_round(self): {'col1': [1.12, 2.12, 3.12], 'col2': [1.23, 2.23, 3.23]}) df.columns.name = "cols" expected_rounded.columns.name = "cols" - assert_frame_equal(df.round(decimals), expected_rounded) + tm.assert_frame_equal(df.round(decimals), expected_rounded) # interaction of named columns & series - assert_series_equal(df['col1'].round(decimals), - expected_rounded['col1']) - assert_series_equal(df.round(decimals)['col1'], - expected_rounded['col1']) + tm.assert_series_equal(df['col1'].round(decimals), + expected_rounded['col1']) + tm.assert_series_equal(df.round(decimals)['col1'], + expected_rounded['col1']) def test_numpy_round(self): # See gh-12600 df = DataFrame([[1.53, 1.36], [0.06, 7.01]]) out = np.round(df, decimals=0) expected = DataFrame([[2., 1.], [0., 7.]]) - assert_frame_equal(out, expected) + tm.assert_frame_equal(out, expected) msg = "the 'out' parameter is not supported" with tm.assertRaisesRegexp(ValueError, msg): @@ -1935,12 +1930,12 @@ def test_round_mixed_type(self): round_0 = DataFrame({'col1': [1., 2., 3., 4.], 'col2': ['1', 'a', 'c', 'f'], 'col3': date_range('20111111', periods=4)}) - 
assert_frame_equal(df.round(), round_0) - assert_frame_equal(df.round(1), df) - assert_frame_equal(df.round({'col1': 1}), df) - assert_frame_equal(df.round({'col1': 0}), round_0) - assert_frame_equal(df.round({'col1': 0, 'col2': 1}), round_0) - assert_frame_equal(df.round({'col3': 1}), df) + tm.assert_frame_equal(df.round(), round_0) + tm.assert_frame_equal(df.round(1), df) + tm.assert_frame_equal(df.round({'col1': 1}), df) + tm.assert_frame_equal(df.round({'col1': 0}), round_0) + tm.assert_frame_equal(df.round({'col1': 0, 'col2': 1}), round_0) + tm.assert_frame_equal(df.round({'col3': 1}), df) def test_round_issue(self): # GH11611 @@ -1950,7 +1945,7 @@ def test_round_issue(self): dfs = pd.concat((df, df), axis=1) rounded = dfs.round() - self.assertTrue(rounded.index.equals(dfs.index)) + self.assert_index_equal(rounded.index, dfs.index) decimals = pd.Series([1, 0, 2], index=['A', 'B', 'A']) self.assertRaises(ValueError, df.round, decimals) @@ -1968,7 +1963,7 @@ def test_built_in_round(self): # Default round to integer (i.e. decimals=0) expected_rounded = DataFrame( {'col1': [1., 2., 3.], 'col2': [1., 2., 3.]}) - assert_frame_equal(round(df), expected_rounded) + tm.assert_frame_equal(round(df), expected_rounded) # Clip @@ -2015,14 +2010,14 @@ def test_clip_against_series(self): mask = ~lb_mask & ~ub_mask result = clipped_df.loc[lb_mask, i] - assert_series_equal(result, lb[lb_mask], check_names=False) + tm.assert_series_equal(result, lb[lb_mask], check_names=False) self.assertEqual(result.name, i) result = clipped_df.loc[ub_mask, i] - assert_series_equal(result, ub[ub_mask], check_names=False) + tm.assert_series_equal(result, ub[ub_mask], check_names=False) self.assertEqual(result.name, i) - assert_series_equal(clipped_df.loc[mask, i], df.loc[mask, i]) + tm.assert_series_equal(clipped_df.loc[mask, i], df.loc[mask, i]) def test_clip_against_frame(self): df = DataFrame(np.random.randn(1000, 2)) @@ -2035,9 +2030,9 @@ def test_clip_against_frame(self): ub_mask = df >= ub mask = ~lb_mask & ~ub_mask - assert_frame_equal(clipped_df[lb_mask], lb[lb_mask]) - assert_frame_equal(clipped_df[ub_mask], ub[ub_mask]) - assert_frame_equal(clipped_df[mask], df[mask]) + tm.assert_frame_equal(clipped_df[lb_mask], lb[lb_mask]) + tm.assert_frame_equal(clipped_df[ub_mask], ub[ub_mask]) + tm.assert_frame_equal(clipped_df[mask], df[mask]) # Matrix-like @@ -2054,15 +2049,15 @@ def test_dot(self): # Check alignment b1 = b.reindex(index=reversed(b.index)) result = a.dot(b) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # Check series argument result = a.dot(b['one']) - assert_series_equal(result, expected['one'], check_names=False) + tm.assert_series_equal(result, expected['one'], check_names=False) self.assertTrue(result.name is None) result = a.dot(b1['one']) - assert_series_equal(result, expected['one'], check_names=False) + tm.assert_series_equal(result, expected['one'], check_names=False) self.assertTrue(result.name is None) # can pass correct-length arrays @@ -2070,9 +2065,9 @@ def test_dot(self): result = a.dot(row) exp = a.dot(a.ix[0]) - assert_series_equal(result, exp) + tm.assert_series_equal(result, exp) - with assertRaisesRegexp(ValueError, 'Dot product shape mismatch'): + with tm.assertRaisesRegexp(ValueError, 'Dot product shape mismatch'): a.dot(row[:-1]) a = np.random.rand(1, 5) @@ -2089,7 +2084,8 @@ def test_dot(self): df = DataFrame(randn(3, 4), index=[1, 2, 3], columns=lrange(4)) df2 = DataFrame(randn(5, 3), index=lrange(5), columns=[1, 2, 3]) - 
assertRaisesRegexp(ValueError, 'aligned', df.dot, df2) + with tm.assertRaisesRegexp(ValueError, 'aligned'): + df.dot(df2) if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], diff --git a/pandas/tests/frame/test_axis_select_reindex.py b/pandas/tests/frame/test_axis_select_reindex.py index 09dd0f3b14812..07fe28f13b7d0 100644 --- a/pandas/tests/frame/test_axis_select_reindex.py +++ b/pandas/tests/frame/test_axis_select_reindex.py @@ -221,7 +221,7 @@ def test_reindex(self): # pass non-Index newFrame = self.frame.reindex(list(self.ts1.index)) - self.assertTrue(newFrame.index.equals(self.ts1.index)) + self.assert_index_equal(newFrame.index, self.ts1.index) # copy with no axes result = self.frame.reindex() @@ -381,7 +381,7 @@ def test_align(self): # axis = 0 other = self.frame.ix[:-5, :3] af, bf = self.frame.align(other, axis=0, fill_value=-1) - self.assertTrue(bf.columns.equals(other.columns)) + self.assert_index_equal(bf.columns, other.columns) # test fill value join_idx = self.frame.index.join(other.index) diff_a = self.frame.index.difference(join_idx) @@ -391,15 +391,15 @@ def test_align(self): self.assertTrue((diff_a_vals == -1).all()) af, bf = self.frame.align(other, join='right', axis=0) - self.assertTrue(bf.columns.equals(other.columns)) - self.assertTrue(bf.index.equals(other.index)) - self.assertTrue(af.index.equals(other.index)) + self.assert_index_equal(bf.columns, other.columns) + self.assert_index_equal(bf.index, other.index) + self.assert_index_equal(af.index, other.index) # axis = 1 other = self.frame.ix[:-5, :3].copy() af, bf = self.frame.align(other, axis=1) - self.assertTrue(bf.columns.equals(self.frame.columns)) - self.assertTrue(bf.index.equals(other.index)) + self.assert_index_equal(bf.columns, self.frame.columns) + self.assert_index_equal(bf.index, other.index) # test fill value join_idx = self.frame.index.join(other.index) @@ -413,35 +413,35 @@ def test_align(self): self.assertTrue((diff_a_vals == -1).all()) af, bf = self.frame.align(other, join='inner', axis=1) - self.assertTrue(bf.columns.equals(other.columns)) + self.assert_index_equal(bf.columns, other.columns) af, bf = self.frame.align(other, join='inner', axis=1, method='pad') - self.assertTrue(bf.columns.equals(other.columns)) + self.assert_index_equal(bf.columns, other.columns) # test other non-float types af, bf = self.intframe.align(other, join='inner', axis=1, method='pad') - self.assertTrue(bf.columns.equals(other.columns)) + self.assert_index_equal(bf.columns, other.columns) af, bf = self.mixed_frame.align(self.mixed_frame, join='inner', axis=1, method='pad') - self.assertTrue(bf.columns.equals(self.mixed_frame.columns)) + self.assert_index_equal(bf.columns, self.mixed_frame.columns) af, bf = self.frame.align(other.ix[:, 0], join='inner', axis=1, method=None, fill_value=None) - self.assertTrue(bf.index.equals(Index([]))) + self.assert_index_equal(bf.index, Index([])) af, bf = self.frame.align(other.ix[:, 0], join='inner', axis=1, method=None, fill_value=0) - self.assertTrue(bf.index.equals(Index([]))) + self.assert_index_equal(bf.index, Index([])) # mixed floats/ints af, bf = self.mixed_float.align(other.ix[:, 0], join='inner', axis=1, method=None, fill_value=0) - self.assertTrue(bf.index.equals(Index([]))) + self.assert_index_equal(bf.index, Index([])) af, bf = self.mixed_int.align(other.ix[:, 0], join='inner', axis=1, method=None, fill_value=0) - self.assertTrue(bf.index.equals(Index([]))) + self.assert_index_equal(bf.index, Index([])) # try to align dataframe 
to series along bad axis self.assertRaises(ValueError, self.frame.align, af.ix[0, :3], @@ -810,10 +810,9 @@ def test_reindex_corner(self): index = Index(['a', 'b', 'c']) dm = self.empty.reindex(index=[1, 2, 3]) reindexed = dm.reindex(columns=index) - self.assertTrue(reindexed.columns.equals(index)) + self.assert_index_equal(reindexed.columns, index) # ints are weird - smaller = self.intframe.reindex(columns=['A', 'B', 'E']) self.assertEqual(smaller['E'].dtype, np.float64) diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index f337bf48c05ee..0421cf2ba42d2 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -505,8 +505,8 @@ def test_get_X_columns(self): 'd': [None, None, None], 'e': [3.14, 0.577, 2.773]}) - self.assert_numpy_array_equal(df._get_numeric_data().columns, - ['a', 'b', 'e']) + self.assert_index_equal(df._get_numeric_data().columns, + pd.Index(['a', 'b', 'e'])) def test_strange_column_corruption_issue(self): # (wesm) Unclear how exactly this is related to internal matters diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 6913df765862d..a050d74f0fc51 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -24,12 +24,6 @@ import pandas as pd import pandas.core.common as com import pandas.lib as lib - -from pandas.util.testing import (assert_numpy_array_equal, - assert_series_equal, - assert_frame_equal, - assertRaisesRegexp) - import pandas.util.testing as tm from pandas.tests.frame.common import TestData @@ -171,16 +165,16 @@ def test_constructor_rec(self): index = self.frame.index df = DataFrame(rec) - self.assert_numpy_array_equal(df.columns, rec.dtype.names) + self.assert_index_equal(df.columns, pd.Index(rec.dtype.names)) df2 = DataFrame(rec, index=index) - self.assert_numpy_array_equal(df2.columns, rec.dtype.names) - self.assertTrue(df2.index.equals(index)) + self.assert_index_equal(df2.columns, pd.Index(rec.dtype.names)) + self.assert_index_equal(df2.index, index) rng = np.arange(len(rec))[::-1] df3 = DataFrame(rec, index=rng, columns=['C', 'B']) expected = DataFrame(rec, index=rng).reindex(columns=['C', 'B']) - assert_frame_equal(df3, expected) + tm.assert_frame_equal(df3, expected) def test_constructor_bool(self): df = DataFrame({0: np.ones(10, dtype=bool), @@ -223,6 +217,7 @@ def test_constructor_dict(self): self.assertEqual(len(self.ts2), 25) tm.assert_series_equal(self.ts1, frame['col1'], check_names=False) + exp = pd.Series(np.concatenate([[np.nan] * 5, self.ts2.values]), index=self.ts1.index, name='col2') tm.assert_series_equal(exp, frame['col2']) @@ -245,7 +240,7 @@ def test_constructor_dict(self): # Length-one dict micro-optimization frame = DataFrame({'A': {'1': 1, '2': 2}}) - self.assert_numpy_array_equal(frame.index, ['1', '2']) + self.assert_index_equal(frame.index, pd.Index(['1', '2'])) # empty dict plus index idx = Index([0, 1, 2]) @@ -261,7 +256,7 @@ def test_constructor_dict(self): # with dict of empty list and Series frame = DataFrame({'A': [], 'B': []}, columns=['A', 'B']) - self.assertTrue(frame.index.equals(Index([]))) + self.assert_index_equal(frame.index, Index([], dtype=np.int64)) # GH10856 # dict with scalar values should raise error, even if columns passed @@ -290,37 +285,37 @@ def test_constructor_multi_index(self): def test_constructor_error_msgs(self): msg = "Empty data passed with indices specified." 
# passing an empty array with columns specified. - with assertRaisesRegexp(ValueError, msg): + with tm.assertRaisesRegexp(ValueError, msg): DataFrame(np.empty(0), columns=list('abc')) msg = "Mixing dicts with non-Series may lead to ambiguous ordering." # mix dict and array, wrong size - with assertRaisesRegexp(ValueError, msg): + with tm.assertRaisesRegexp(ValueError, msg): DataFrame({'A': {'a': 'a', 'b': 'b'}, 'B': ['a', 'b', 'c']}) # wrong size ndarray, GH 3105 msg = "Shape of passed values is \(3, 4\), indices imply \(3, 3\)" - with assertRaisesRegexp(ValueError, msg): + with tm.assertRaisesRegexp(ValueError, msg): DataFrame(np.arange(12).reshape((4, 3)), columns=['foo', 'bar', 'baz'], index=pd.date_range('2000-01-01', periods=3)) # higher dim raise exception - with assertRaisesRegexp(ValueError, 'Must pass 2-d input'): + with tm.assertRaisesRegexp(ValueError, 'Must pass 2-d input'): DataFrame(np.zeros((3, 3, 3)), columns=['A', 'B', 'C'], index=[1]) # wrong size axis labels - with assertRaisesRegexp(ValueError, "Shape of passed values is " - "\(3, 2\), indices imply \(3, 1\)"): + with tm.assertRaisesRegexp(ValueError, "Shape of passed values is " + "\(3, 2\), indices imply \(3, 1\)"): DataFrame(np.random.rand(2, 3), columns=['A', 'B', 'C'], index=[1]) - with assertRaisesRegexp(ValueError, "Shape of passed values is " - "\(3, 2\), indices imply \(2, 2\)"): + with tm.assertRaisesRegexp(ValueError, "Shape of passed values is " + "\(3, 2\), indices imply \(2, 2\)"): DataFrame(np.random.rand(2, 3), columns=['A', 'B'], index=[1, 2]) - with assertRaisesRegexp(ValueError, 'If using all scalar values, you ' - 'must pass an index'): + with tm.assertRaisesRegexp(ValueError, 'If using all scalar values, ' + 'you must pass an index'): DataFrame({'a': False, 'b': True}) def test_constructor_with_embedded_frames(self): @@ -333,10 +328,10 @@ def test_constructor_with_embedded_frames(self): str(df2) result = df2.loc[0, 0] - assert_frame_equal(result, df1) + tm.assert_frame_equal(result, df1) result = df2.loc[1, 0] - assert_frame_equal(result, df1 + 10) + tm.assert_frame_equal(result, df1 + 10) def test_constructor_subclass_dict(self): # Test for passing dict subclass to constructor @@ -345,11 +340,11 @@ def test_constructor_subclass_dict(self): df = DataFrame(data) refdf = DataFrame(dict((col, dict(compat.iteritems(val))) for col, val in compat.iteritems(data))) - assert_frame_equal(refdf, df) + tm.assert_frame_equal(refdf, df) data = tm.TestSubDict(compat.iteritems(data)) df = DataFrame(data) - assert_frame_equal(refdf, df) + tm.assert_frame_equal(refdf, df) # try with defaultdict from collections import defaultdict @@ -360,10 +355,10 @@ def test_constructor_subclass_dict(self): dct.update(v.to_dict()) data[k] = dct frame = DataFrame(data) - assert_frame_equal(self.frame.sort_index(), frame) + tm.assert_frame_equal(self.frame.sort_index(), frame) def test_constructor_dict_block(self): - expected = [[4., 3., 2., 1.]] + expected = np.array([[4., 3., 2., 1.]]) df = DataFrame({'d': [4.], 'c': [3.], 'b': [2.], 'a': [1.]}, columns=['d', 'c', 'b', 'a']) tm.assert_numpy_array_equal(df.values, expected) @@ -409,10 +404,10 @@ def test_constructor_dict_of_tuples(self): result = DataFrame(data) expected = DataFrame(dict((k, list(v)) for k, v in compat.iteritems(data))) - assert_frame_equal(result, expected, check_dtype=False) + tm.assert_frame_equal(result, expected, check_dtype=False) def test_constructor_dict_multiindex(self): - check = lambda result, expected: assert_frame_equal( + check = lambda result, 
expected: tm.assert_frame_equal( result, expected, check_dtype=True, check_index_type=True, check_column_type=True, check_names=True) d = {('a', 'a'): {('i', 'i'): 0, ('i', 'j'): 1, ('j', 'i'): 2}, @@ -457,9 +452,9 @@ def create_data(constructor): result_datetime64 = DataFrame(data_datetime64) result_datetime = DataFrame(data_datetime) result_Timestamp = DataFrame(data_Timestamp) - assert_frame_equal(result_datetime64, expected) - assert_frame_equal(result_datetime, expected) - assert_frame_equal(result_Timestamp, expected) + tm.assert_frame_equal(result_datetime64, expected) + tm.assert_frame_equal(result_datetime, expected) + tm.assert_frame_equal(result_Timestamp, expected) def test_constructor_dict_timedelta64_index(self): # GH 10160 @@ -482,9 +477,9 @@ def create_data(constructor): result_timedelta64 = DataFrame(data_timedelta64) result_timedelta = DataFrame(data_timedelta) result_Timedelta = DataFrame(data_Timedelta) - assert_frame_equal(result_timedelta64, expected) - assert_frame_equal(result_timedelta, expected) - assert_frame_equal(result_Timedelta, expected) + tm.assert_frame_equal(result_timedelta64, expected) + tm.assert_frame_equal(result_timedelta, expected) + tm.assert_frame_equal(result_Timedelta, expected) def test_constructor_period(self): # PeriodIndex @@ -510,7 +505,7 @@ def test_nested_dict_frame_constructor(self): data.setdefault(col, {})[row] = df.get_value(row, col) result = DataFrame(data, columns=rng) - assert_frame_equal(result, df) + tm.assert_frame_equal(result, df) data = {} for col in df.columns: @@ -518,7 +513,7 @@ def test_nested_dict_frame_constructor(self): data.setdefault(row, {})[col] = df.get_value(row, col) result = DataFrame(data, index=rng).T - assert_frame_equal(result, df) + tm.assert_frame_equal(result, df) def _check_basic_constructor(self, empty): # mat: 2d matrix with shape (3, 2) to input.
empty - makes sized @@ -542,27 +537,27 @@ def _check_basic_constructor(self, empty): # wrong size axis labels msg = r'Shape of passed values is \(3, 2\), indices imply \(3, 1\)' - with assertRaisesRegexp(ValueError, msg): + with tm.assertRaisesRegexp(ValueError, msg): DataFrame(mat, columns=['A', 'B', 'C'], index=[1]) msg = r'Shape of passed values is \(3, 2\), indices imply \(2, 2\)' - with assertRaisesRegexp(ValueError, msg): + with tm.assertRaisesRegexp(ValueError, msg): DataFrame(mat, columns=['A', 'B'], index=[1, 2]) # higher dim raise exception - with assertRaisesRegexp(ValueError, 'Must pass 2-d input'): + with tm.assertRaisesRegexp(ValueError, 'Must pass 2-d input'): DataFrame(empty((3, 3, 3)), columns=['A', 'B', 'C'], index=[1]) # automatic labeling frame = DataFrame(mat) - self.assert_numpy_array_equal(frame.index, lrange(2)) - self.assert_numpy_array_equal(frame.columns, lrange(3)) + self.assert_index_equal(frame.index, pd.Index(lrange(2))) + self.assert_index_equal(frame.columns, pd.Index(lrange(3))) frame = DataFrame(mat, index=[1, 2]) - self.assert_numpy_array_equal(frame.columns, lrange(3)) + self.assert_index_equal(frame.columns, pd.Index(lrange(3))) frame = DataFrame(mat, columns=['A', 'B', 'C']) - self.assert_numpy_array_equal(frame.index, lrange(2)) + self.assert_index_equal(frame.index, pd.Index(lrange(2))) # 0-length axis frame = DataFrame(empty((0, 3))) @@ -664,7 +659,7 @@ def test_constructor_mrecarray(self): # Ensure mrecarray produces frame identical to dict of masked arrays # from GH3479 - assert_fr_equal = functools.partial(assert_frame_equal, + assert_fr_equal = functools.partial(tm.assert_frame_equal, check_index_type=True, check_column_type=True, check_frame_type=True) @@ -738,13 +733,13 @@ def test_constructor_arrays_and_scalars(self): df = DataFrame({'a': randn(10), 'b': True}) exp = DataFrame({'a': df['a'].values, 'b': [True] * 10}) - assert_frame_equal(df, exp) + tm.assert_frame_equal(df, exp) with tm.assertRaisesRegexp(ValueError, 'must pass an index'): DataFrame({'a': False, 'b': True}) def test_constructor_DataFrame(self): df = DataFrame(self.frame) - assert_frame_equal(df, self.frame) + tm.assert_frame_equal(df, self.frame) df_casted = DataFrame(self.frame, dtype=np.int64) self.assertEqual(df_casted.values.dtype, np.int64) @@ -772,17 +767,17 @@ def test_constructor_more(self): # corner, silly # TODO: Fix this Exception to be better... 
- with assertRaisesRegexp(PandasError, 'constructor not ' - 'properly called'): + with tm.assertRaisesRegexp(PandasError, 'constructor not ' + 'properly called'): DataFrame((1, 2, 3)) # can't cast mat = np.array(['foo', 'bar'], dtype=object).reshape(2, 1) - with assertRaisesRegexp(ValueError, 'cast'): + with tm.assertRaisesRegexp(ValueError, 'cast'): DataFrame(mat, index=[0, 1], columns=[0], dtype=float) dm = DataFrame(DataFrame(self.frame._series)) - assert_frame_equal(dm, self.frame) + tm.assert_frame_equal(dm, self.frame) # int cast dm = DataFrame({'A': np.ones(10, dtype=int), @@ -795,12 +790,12 @@ def test_constructor_more(self): def test_constructor_empty_list(self): df = DataFrame([], index=[]) expected = DataFrame(index=[]) - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) # GH 9939 df = DataFrame([], columns=['A', 'B']) expected = DataFrame({}, columns=['A', 'B']) - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) # Empty generator: list(empty_gen()) == [] def empty_gen(): @@ -808,7 +803,7 @@ def empty_gen(): yield df = DataFrame(empty_gen(), columns=['A', 'B']) - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) def test_constructor_list_of_lists(self): # GH #484 @@ -822,7 +817,7 @@ def test_constructor_list_of_lists(self): expected = DataFrame({0: range(10)}) data = [np.array(x) for x in range(10)] result = DataFrame(data) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_constructor_sequence_like(self): # GH 3783 @@ -844,25 +839,25 @@ def __len__(self, n): columns = ["num", "str"] result = DataFrame(l, columns=columns) expected = DataFrame([[1, 'a'], [2, 'b']], columns=columns) - assert_frame_equal(result, expected, check_dtype=False) + tm.assert_frame_equal(result, expected, check_dtype=False) # GH 4297 # support Array import array result = DataFrame.from_items([('A', array.array('i', range(10)))]) expected = DataFrame({'A': list(range(10))}) - assert_frame_equal(result, expected, check_dtype=False) + tm.assert_frame_equal(result, expected, check_dtype=False) expected = DataFrame([list(range(10)), list(range(10))]) result = DataFrame([array.array('i', range(10)), array.array('i', range(10))]) - assert_frame_equal(result, expected, check_dtype=False) + tm.assert_frame_equal(result, expected, check_dtype=False) def test_constructor_iterator(self): expected = DataFrame([list(range(10)), list(range(10))]) result = DataFrame([range(10), range(10)]) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_constructor_generator(self): # related #2305 @@ -872,12 +867,12 @@ def test_constructor_generator(self): expected = DataFrame([list(range(10)), list(range(10))]) result = DataFrame([gen1, gen2]) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) gen = ([i, 'a'] for i in range(10)) result = DataFrame(gen) expected = DataFrame({0: range(10), 1: 'a'}) - assert_frame_equal(result, expected, check_dtype=False) + tm.assert_frame_equal(result, expected, check_dtype=False) def test_constructor_list_of_dicts(self): data = [OrderedDict([['a', 1.5], ['b', 3], ['c', 4], ['d', 6]]), @@ -890,11 +885,11 @@ def test_constructor_list_of_dicts(self): result = DataFrame(data) expected = DataFrame.from_dict(dict(zip(range(len(data)), data)), orient='index') - assert_frame_equal(result, expected.reindex(result.index)) + tm.assert_frame_equal(result, expected.reindex(result.index)) result = DataFrame([{}]) expected = DataFrame(index=[0]) 
- assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_constructor_list_of_series(self): data = [OrderedDict([['a', 1.5], ['b', 3.0], ['c', 4.0]]), @@ -907,7 +902,7 @@ def test_constructor_list_of_series(self): Series([1.5, 3, 6], idx, name='y')] result = DataFrame(data2) expected = DataFrame.from_dict(sdict, orient='index') - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # some unnamed data2 = [Series([1.5, 3, 4], idx, dtype='O', name='x'), @@ -916,7 +911,7 @@ def test_constructor_list_of_series(self): sdict = OrderedDict(zip(['x', 'Unnamed 0'], data)) expected = DataFrame.from_dict(sdict, orient='index') - assert_frame_equal(result.sort_index(), expected) + tm.assert_frame_equal(result.sort_index(), expected) # none named data = [OrderedDict([['a', 1.5], ['b', 3], ['c', 4], ['d', 6]]), @@ -930,14 +925,14 @@ def test_constructor_list_of_series(self): result = DataFrame(data) sdict = OrderedDict(zip(range(len(data)), data)) expected = DataFrame.from_dict(sdict, orient='index') - assert_frame_equal(result, expected.reindex(result.index)) + tm.assert_frame_equal(result, expected.reindex(result.index)) result2 = DataFrame(data, index=np.arange(6)) - assert_frame_equal(result, result2) + tm.assert_frame_equal(result, result2) result = DataFrame([Series({})]) expected = DataFrame(index=[0]) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) data = [OrderedDict([['a', 1.5], ['b', 3.0], ['c', 4.0]]), OrderedDict([['a', 1.5], ['b', 3.0], ['c', 6.0]])] @@ -948,7 +943,7 @@ def test_constructor_list_of_series(self): Series([1.5, 3, 6], idx)] result = DataFrame(data2) expected = DataFrame.from_dict(sdict, orient='index') - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_constructor_list_of_derived_dicts(self): class CustomDict(dict): @@ -960,19 +955,20 @@ class CustomDict(dict): result_custom = DataFrame(data_custom) result = DataFrame(data) - assert_frame_equal(result, result_custom) + tm.assert_frame_equal(result, result_custom) def test_constructor_ragged(self): data = {'A': randn(10), 'B': randn(8)} - with assertRaisesRegexp(ValueError, 'arrays must all be same length'): + with tm.assertRaisesRegexp(ValueError, + 'arrays must all be same length'): DataFrame(data) def test_constructor_scalar(self): idx = Index(lrange(3)) df = DataFrame({"a": 0}, index=idx) expected = DataFrame({"a": [0, 0, 0]}, index=idx) - assert_frame_equal(df, expected, check_dtype=False) + tm.assert_frame_equal(df, expected, check_dtype=False) def test_constructor_Series_copy_bug(self): df = DataFrame(self.frame['A'], index=self.frame.index, columns=['A']) @@ -987,7 +983,7 @@ def test_constructor_mixed_dict_and_Series(self): self.assertTrue(result.index.is_monotonic) # ordering ambiguous, raise exception - with assertRaisesRegexp(ValueError, 'ambiguous ordering'): + with tm.assertRaisesRegexp(ValueError, 'ambiguous ordering'): DataFrame({'A': ['a', 'b'], 'B': {'a': 'a', 'b': 'b'}}) # this is OK though @@ -995,12 +991,12 @@ def test_constructor_mixed_dict_and_Series(self): 'B': Series(['a', 'b'], index=['a', 'b'])}) expected = DataFrame({'A': ['a', 'b'], 'B': ['a', 'b']}, index=['a', 'b']) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_constructor_tuples(self): result = DataFrame({'A': [(1, 2), (3, 4)]}) expected = DataFrame({'A': Series([(1, 2), (3, 4)])}) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def 
test_constructor_namedtuples(self): # GH11181 @@ -1009,43 +1005,43 @@ def test_constructor_namedtuples(self): tuples = [named_tuple(1, 3), named_tuple(2, 4)] expected = DataFrame({'a': [1, 2], 'b': [3, 4]}) result = DataFrame(tuples) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # with columns expected = DataFrame({'y': [1, 2], 'z': [3, 4]}) result = DataFrame(tuples, columns=['y', 'z']) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_constructor_orient(self): data_dict = self.mixed_frame.T._series recons = DataFrame.from_dict(data_dict, orient='index') expected = self.mixed_frame.sort_index() - assert_frame_equal(recons, expected) + tm.assert_frame_equal(recons, expected) # dict of sequence a = {'hi': [32, 3, 3], 'there': [3, 5, 3]} rs = DataFrame.from_dict(a, orient='index') xp = DataFrame.from_dict(a).T.reindex(list(a.keys())) - assert_frame_equal(rs, xp) + tm.assert_frame_equal(rs, xp) def test_constructor_Series_named(self): a = Series([1, 2, 3], index=['a', 'b', 'c'], name='x') df = DataFrame(a) self.assertEqual(df.columns[0], 'x') - self.assertTrue(df.index.equals(a.index)) + self.assert_index_equal(df.index, a.index) # ndarray like arr = np.random.randn(10) s = Series(arr, name='x') df = DataFrame(s) expected = DataFrame(dict(x=s)) - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) s = Series(arr, index=range(3, 13)) df = DataFrame(s) expected = DataFrame({0: s}) - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) self.assertRaises(ValueError, DataFrame, s, columns=[1, 2]) @@ -1059,12 +1055,12 @@ def test_constructor_Series_named(self): df = DataFrame([s1, arr]).T expected = DataFrame({'x': s1, 'Unnamed 0': arr}, columns=['x', 'Unnamed 0']) - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) # this is a bit non-intuitive here; the series collapse down to arrays df = DataFrame([arr, s1]).T expected = DataFrame({1: s1, 0: arr}, columns=[0, 1]) - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) def test_constructor_Series_differently_indexed(self): # name @@ -1078,13 +1074,13 @@ def test_constructor_Series_differently_indexed(self): df1 = DataFrame(s1, index=other_index) exp1 = DataFrame(s1.reindex(other_index)) self.assertEqual(df1.columns[0], 'x') - assert_frame_equal(df1, exp1) + tm.assert_frame_equal(df1, exp1) df2 = DataFrame(s2, index=other_index) exp2 = DataFrame(s2.reindex(other_index)) self.assertEqual(df2.columns[0], 0) - self.assertTrue(df2.index.equals(other_index)) - assert_frame_equal(df2, exp2) + self.assert_index_equal(df2.index, other_index) + tm.assert_frame_equal(df2, exp2) def test_constructor_manager_resize(self): index = list(self.frame.index[:5]) @@ -1092,17 +1088,17 @@ def test_constructor_manager_resize(self): result = DataFrame(self.frame._data, index=index, columns=columns) - self.assert_numpy_array_equal(result.index, index) - self.assert_numpy_array_equal(result.columns, columns) + self.assert_index_equal(result.index, Index(index)) + self.assert_index_equal(result.columns, Index(columns)) def test_constructor_from_items(self): items = [(c, self.frame[c]) for c in self.frame.columns] recons = DataFrame.from_items(items) - assert_frame_equal(recons, self.frame) + tm.assert_frame_equal(recons, self.frame) # pass some columns recons = DataFrame.from_items(items, columns=['C', 'B', 'A']) - assert_frame_equal(recons, self.frame.ix[:, ['C', 'B', 'A']]) + tm.assert_frame_equal(recons, self.frame.ix[:, 
['C', 'B', 'A']]) # orient='index' @@ -1112,7 +1108,7 @@ def test_constructor_from_items(self): recons = DataFrame.from_items(row_items, columns=self.mixed_frame.columns, orient='index') - assert_frame_equal(recons, self.mixed_frame) + tm.assert_frame_equal(recons, self.mixed_frame) self.assertEqual(recons['A'].dtype, np.float64) with tm.assertRaisesRegexp(TypeError, @@ -1128,7 +1124,7 @@ def test_constructor_from_items(self): recons = DataFrame.from_items(row_items, columns=self.mixed_frame.columns, orient='index') - assert_frame_equal(recons, self.mixed_frame) + tm.assert_frame_equal(recons, self.mixed_frame) tm.assertIsInstance(recons['foo'][0], tuple) rs = DataFrame.from_items([('A', [1, 2, 3]), ('B', [4, 5, 6])], @@ -1136,12 +1132,12 @@ def test_constructor_from_items(self): columns=['one', 'two', 'three']) xp = DataFrame([[1, 2, 3], [4, 5, 6]], index=['A', 'B'], columns=['one', 'two', 'three']) - assert_frame_equal(rs, xp) + tm.assert_frame_equal(rs, xp) def test_constructor_mix_series_nonseries(self): df = DataFrame({'A': self.frame['A'], 'B': list(self.frame['B'])}, columns=['A', 'B']) - assert_frame_equal(df, self.frame.ix[:, ['A', 'B']]) + tm.assert_frame_equal(df, self.frame.ix[:, ['A', 'B']]) with tm.assertRaisesRegexp(ValueError, 'does not match index length'): DataFrame({'A': self.frame['A'], 'B': list(self.frame['B'])[:-2]}) @@ -1149,10 +1145,10 @@ def test_constructor_mix_series_nonseries(self): def test_constructor_miscast_na_int_dtype(self): df = DataFrame([[np.nan, 1], [1, 0]], dtype=np.int64) expected = DataFrame([[np.nan, 1], [1, 0]]) - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) def test_constructor_iterator_failure(self): - with assertRaisesRegexp(TypeError, 'iterator'): + with tm.assertRaisesRegexp(TypeError, 'iterator'): df = DataFrame(iter([1, 2, 3])) # noqa def test_constructor_column_duplicates(self): @@ -1161,11 +1157,11 @@ def test_constructor_column_duplicates(self): edf = DataFrame([[8, 5]]) edf.columns = ['a', 'a'] - assert_frame_equal(df, edf) + tm.assert_frame_equal(df, edf) idf = DataFrame.from_items( [('a', [8]), ('a', [5])], columns=['a', 'a']) - assert_frame_equal(idf, edf) + tm.assert_frame_equal(idf, edf) self.assertRaises(ValueError, DataFrame.from_items, [('a', [8]), ('a', [5]), ('b', [6])], @@ -1176,30 +1172,29 @@ def test_constructor_empty_with_string_dtype(self): expected = DataFrame(index=[0, 1], columns=[0, 1], dtype=object) df = DataFrame(index=[0, 1], columns=[0, 1], dtype=str) - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) df = DataFrame(index=[0, 1], columns=[0, 1], dtype=np.str_) - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) df = DataFrame(index=[0, 1], columns=[0, 1], dtype=np.unicode_) - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) df = DataFrame(index=[0, 1], columns=[0, 1], dtype='U5') - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) def test_constructor_single_value(self): # expecting single value upcasting here df = DataFrame(0., index=[1, 2, 3], columns=['a', 'b', 'c']) - assert_frame_equal(df, DataFrame(np.zeros(df.shape).astype('float64'), - df.index, df.columns)) + tm.assert_frame_equal(df, + DataFrame(np.zeros(df.shape).astype('float64'), + df.index, df.columns)) df = DataFrame(0, index=[1, 2, 3], columns=['a', 'b', 'c']) - assert_frame_equal(df, DataFrame(np.zeros(df.shape).astype('int64'), - df.index, df.columns)) + tm.assert_frame_equal(df, DataFrame(np.zeros(df.shape).astype('int64'), + 
df.index, df.columns)) df = DataFrame('a', index=[1, 2], columns=['a', 'c']) - assert_frame_equal(df, DataFrame(np.array([['a', 'a'], - ['a', 'a']], - dtype=object), - index=[1, 2], - columns=['a', 'c'])) + tm.assert_frame_equal(df, DataFrame(np.array([['a', 'a'], ['a', 'a']], + dtype=object), + index=[1, 2], columns=['a', 'c'])) self.assertRaises(com.PandasError, DataFrame, 'a', [1, 2]) self.assertRaises(com.PandasError, DataFrame, 'a', columns=['a', 'c']) @@ -1221,7 +1216,7 @@ def test_constructor_with_datetimes(self): expected = Series({'int64': 1, datetime64name: 2, objectname: 2}) result.sort_index() expected.sort_index() - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) # check with ndarray construction ndim==0 (e.g. we are passing a ndim 0 # ndarray with a dtype specified) @@ -1245,7 +1240,7 @@ def test_constructor_with_datetimes(self): result.sort_index() expected = Series(expected) expected.sort_index() - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) # check with ndarray construction ndim>0 df = DataFrame({'a': 1., 'b': 2, 'c': 'foo', @@ -1254,7 +1249,7 @@ def test_constructor_with_datetimes(self): index=np.arange(10)) result = df.get_dtype_counts() result.sort_index() - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) # GH 2809 ind = date_range(start="2000-01-01", freq="D", periods=10) @@ -1266,7 +1261,7 @@ def test_constructor_with_datetimes(self): expected = Series({datetime64name: 1}) result.sort_index() expected.sort_index() - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) # GH 2810 ind = date_range(start="2000-01-01", freq="D", periods=10) @@ -1277,7 +1272,7 @@ def test_constructor_with_datetimes(self): expected = Series({datetime64name: 1, objectname: 1}) result.sort_index() expected.sort_index() - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) # GH 7594 # don't coerce tz-aware @@ -1287,12 +1282,12 @@ def test_constructor_with_datetimes(self): df = DataFrame({'End Date': dt}, index=[0]) self.assertEqual(df.iat[0, 0], dt) - assert_series_equal(df.dtypes, Series( + tm.assert_series_equal(df.dtypes, Series( {'End Date': 'datetime64[ns, US/Eastern]'})) df = DataFrame([{'End Date': dt}]) self.assertEqual(df.iat[0, 0], dt) - assert_series_equal(df.dtypes, Series( + tm.assert_series_equal(df.dtypes, Series( {'End Date': 'datetime64[ns, US/Eastern]'})) # tz-aware (UTC and other tz's) @@ -1315,17 +1310,17 @@ def test_constructor_with_datetimes(self): {'a': i.to_series(keep_tz=True).reset_index(drop=True)}) df = DataFrame() df['a'] = i - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) df = DataFrame({'a': i}) - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) # multiples i_no_tz = date_range('1/1/2011', periods=5, freq='10s') df = DataFrame({'a': i, 'b': i_no_tz}) expected = DataFrame({'a': i.to_series(keep_tz=True) .reset_index(drop=True), 'b': i_no_tz}) - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) def test_constructor_for_list_with_dtypes(self): # TODO(wesm): unused @@ -1348,39 +1343,39 @@ def test_constructor_for_list_with_dtypes(self): df = DataFrame({'a': [2 ** 31, 2 ** 31 + 1]}) result = df.get_dtype_counts() expected = Series({'int64': 1}) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) # GH #2751 (construction with no index specified), make sure we cast to # platform values df = DataFrame([1, 2]) result = 
df.get_dtype_counts() expected = Series({'int64': 1}) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) df = DataFrame([1., 2.]) result = df.get_dtype_counts() expected = Series({'float64': 1}) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) df = DataFrame({'a': [1, 2]}) result = df.get_dtype_counts() expected = Series({'int64': 1}) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) df = DataFrame({'a': [1., 2.]}) result = df.get_dtype_counts() expected = Series({'float64': 1}) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) df = DataFrame({'a': 1}, index=lrange(3)) result = df.get_dtype_counts() expected = Series({'int64': 1}) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) df = DataFrame({'a': 1.}, index=lrange(3)) result = df.get_dtype_counts() expected = Series({'float64': 1}) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) # with object list df = DataFrame({'a': [1, 2, 4, 7], 'b': [1.2, 2.3, 5.1, 6.3], @@ -1392,7 +1387,7 @@ def test_constructor_for_list_with_dtypes(self): {'int64': 1, 'float64': 2, datetime64name: 1, objectname: 1}) result.sort_index() expected.sort_index() - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) def test_constructor_frame_copy(self): cop = DataFrame(self.frame, copy=True) @@ -1430,7 +1425,8 @@ def check(df): indexer = np.arange(len(df.columns))[isnull(df.columns)] if len(indexer) == 1: - assert_series_equal(df.iloc[:, indexer[0]], df.loc[:, np.nan]) + tm.assert_series_equal(df.iloc[:, indexer[0]], + df.loc[:, np.nan]) # multiple nans should fail else: @@ -1467,17 +1463,17 @@ def test_from_records_to_records(self): # TODO(wesm): unused frame = DataFrame.from_records(arr) # noqa - index = np.arange(len(arr))[::-1] + index = pd.Index(np.arange(len(arr))[::-1]) indexed_frame = DataFrame.from_records(arr, index=index) - self.assert_numpy_array_equal(indexed_frame.index, index) + self.assert_index_equal(indexed_frame.index, index) # without names, it should go to last ditch arr2 = np.zeros((2, 3)) - assert_frame_equal(DataFrame.from_records(arr2), DataFrame(arr2)) + tm.assert_frame_equal(DataFrame.from_records(arr2), DataFrame(arr2)) # wrong length msg = r'Shape of passed values is \(3, 2\), indices imply \(3, 1\)' - with assertRaisesRegexp(ValueError, msg): + with tm.assertRaisesRegexp(ValueError, msg): DataFrame.from_records(arr, index=index[:-1]) indexed_frame = DataFrame.from_records(arr, index='f1') @@ -1508,14 +1504,14 @@ def test_from_records_iterator(self): 'u': np.array([1.0, 3.0], dtype=np.float32), 'y': np.array([2, 4], dtype=np.int64), 'z': np.array([2, 4], dtype=np.int32)}) - assert_frame_equal(df.reindex_like(xp), xp) + tm.assert_frame_equal(df.reindex_like(xp), xp) # no dtypes specified here, so just compare with the default arr = [(1.0, 2), (3.0, 4), (5., 6), (7., 8)] df = DataFrame.from_records(iter(arr), columns=['x', 'y'], nrows=2) - assert_frame_equal(df, xp.reindex( - columns=['x', 'y']), check_dtype=False) + tm.assert_frame_equal(df, xp.reindex(columns=['x', 'y']), + check_dtype=False) def test_from_records_tuples_generator(self): def tuple_generator(length): @@ -1532,7 +1528,7 @@ def tuple_generator(length): generator = tuple_generator(10) result = DataFrame.from_records(generator, columns=columns_names) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def 
test_from_records_lists_generator(self): def list_generator(length): @@ -1549,7 +1545,7 @@ def list_generator(length): generator = list_generator(10) result = DataFrame.from_records(generator, columns=columns_names) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_from_records_columns_not_modified(self): tuples = [(1, 2, 3), @@ -1582,7 +1578,7 @@ def test_from_records_duplicates(self): expected = DataFrame([(1, 2, 3), (4, 5, 6)], columns=['a', 'b', 'a']) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_from_records_set_index_name(self): def create_dict(order_id): @@ -1607,7 +1603,7 @@ def test_from_records_misc_brokenness(self): result = DataFrame.from_records(data, columns=['a', 'b']) exp = DataFrame(data, columns=['a', 'b']) - assert_frame_equal(result, exp) + tm.assert_frame_equal(result, exp) # overlap in index/index_names @@ -1615,7 +1611,7 @@ def test_from_records_misc_brokenness(self): result = DataFrame.from_records(data, index=['a', 'b', 'c']) exp = DataFrame(data, index=['a', 'b', 'c']) - assert_frame_equal(result, exp) + tm.assert_frame_equal(result, exp) # GH 2623 rows = [] @@ -1631,28 +1627,28 @@ def test_from_records_misc_brokenness(self): df2_obj = DataFrame.from_records(rows, columns=['date', 'test']) results = df2_obj.get_dtype_counts() expected = Series({'datetime64[ns]': 1, 'int64': 1}) - assert_series_equal(results, expected) + tm.assert_series_equal(results, expected) def test_from_records_empty(self): # 3562 result = DataFrame.from_records([], columns=['a', 'b', 'c']) expected = DataFrame(columns=['a', 'b', 'c']) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = DataFrame.from_records([], columns=['a', 'b', 'b']) expected = DataFrame(columns=['a', 'b', 'b']) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_from_records_empty_with_nonempty_fields_gh3682(self): a = np.array([(1, 2)], dtype=[('id', np.int64), ('value', np.int64)]) df = DataFrame.from_records(a, index='id') - assert_numpy_array_equal(df.index, Index([1], name='id')) + tm.assert_index_equal(df.index, Index([1], name='id')) self.assertEqual(df.index.name, 'id') - assert_numpy_array_equal(df.columns, Index(['value'])) + tm.assert_index_equal(df.columns, Index(['value'])) b = np.array([], dtype=[('id', np.int64), ('value', np.int64)]) df = DataFrame.from_records(b, index='id') - assert_numpy_array_equal(df.index, Index([], name='id')) + tm.assert_index_equal(df.index, Index([], name='id')) self.assertEqual(df.index.name, 'id') def test_from_records_with_datetimes(self): @@ -1675,14 +1671,14 @@ def test_from_records_with_datetimes(self): raise nose.SkipTest("known failure of numpy rec array creation") result = DataFrame.from_records(recarray) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # coercion should work too arrdata = [np.array([datetime(2005, 3, 1, 0, 0), None])] dtypes = [('EXPIRY', ' 'a'] expected = self.factor[np.asarray(self.factor) > 'a'] - self.assertTrue(result.equals(expected)) + tm.assert_categorical_equal(result, expected) result = self.factor[self.factor >= 'b'] expected = self.factor[np.asarray(self.factor) >= 'b'] - self.assertTrue(result.equals(expected)) + tm.assert_categorical_equal(result, expected) result = self.factor[self.factor <= 'b'] expected = self.factor[np.asarray(self.factor) <= 'b'] - self.assertTrue(result.equals(expected)) + tm.assert_categorical_equal(result, expected) n = 
len(self.factor) @@ -551,7 +554,7 @@ def test_na_flags_int_categories(self): def test_categories_none(self): factor = Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c'], ordered=True) - self.assertTrue(factor.equals(self.factor)) + tm.assert_categorical_equal(factor, self.factor) def test_describe(self): # string type @@ -710,7 +713,7 @@ def test_periodindex(self): exp_arr = np.array([0, 0, 1, 1, 2, 2], dtype=np.int8) exp_idx = PeriodIndex(['2014-01', '2014-02', '2014-03'], freq='M') self.assert_numpy_array_equal(cat1._codes, exp_arr) - self.assertTrue(cat1.categories.equals(exp_idx)) + self.assert_index_equal(cat1.categories, exp_idx) idx2 = PeriodIndex(['2014-03', '2014-03', '2014-02', '2014-01', '2014-03', '2014-01'], freq='M') @@ -719,7 +722,7 @@ def test_periodindex(self): exp_arr = np.array([2, 2, 1, 0, 2, 0], dtype=np.int8) exp_idx2 = PeriodIndex(['2014-01', '2014-02', '2014-03'], freq='M') self.assert_numpy_array_equal(cat2._codes, exp_arr) - self.assertTrue(cat2.categories.equals(exp_idx2)) + self.assert_index_equal(cat2.categories, exp_idx2) idx3 = PeriodIndex(['2013-12', '2013-11', '2013-10', '2013-09', '2013-08', '2013-07', '2013-05'], freq='M') @@ -728,15 +731,14 @@ def test_periodindex(self): exp_idx = PeriodIndex(['2013-05', '2013-07', '2013-08', '2013-09', '2013-10', '2013-11', '2013-12'], freq='M') self.assert_numpy_array_equal(cat3._codes, exp_arr) - self.assertTrue(cat3.categories.equals(exp_idx)) + self.assert_index_equal(cat3.categories, exp_idx) def test_categories_assigments(self): s = pd.Categorical(["a", "b", "c", "a"]) exp = np.array([1, 2, 3, 1], dtype=np.int64) s.categories = [1, 2, 3] self.assert_numpy_array_equal(s.__array__(), exp) - self.assert_numpy_array_equal(s.categories, - np.array([1, 2, 3], dtype=np.int64)) + self.assert_index_equal(s.categories, Index([1, 2, 3])) # lengthen def f(): @@ -762,21 +764,21 @@ def test_construction_with_ordered(self): def test_ordered_api(self): # GH 9347 cat1 = pd.Categorical(["a", "c", "b"], ordered=False) - self.assertTrue(cat1.categories.equals(Index(['a', 'b', 'c']))) + self.assert_index_equal(cat1.categories, Index(['a', 'b', 'c'])) self.assertFalse(cat1.ordered) cat2 = pd.Categorical(["a", "c", "b"], categories=['b', 'c', 'a'], ordered=False) - self.assertTrue(cat2.categories.equals(Index(['b', 'c', 'a']))) + self.assert_index_equal(cat2.categories, Index(['b', 'c', 'a'])) self.assertFalse(cat2.ordered) cat3 = pd.Categorical(["a", "c", "b"], ordered=True) - self.assertTrue(cat3.categories.equals(Index(['a', 'b', 'c']))) + self.assert_index_equal(cat3.categories, Index(['a', 'b', 'c'])) self.assertTrue(cat3.ordered) cat4 = pd.Categorical(["a", "c", "b"], categories=['b', 'c', 'a'], ordered=True) - self.assertTrue(cat4.categories.equals(Index(['b', 'c', 'a']))) + self.assert_index_equal(cat4.categories, Index(['b', 'c', 'a'])) self.assertTrue(cat4.ordered) def test_set_ordered(self): @@ -808,21 +810,21 @@ def test_set_ordered(self): def test_set_categories(self): cat = Categorical(["a", "b", "c", "a"], ordered=True) - exp_categories = np.array(["c", "b", "a"], dtype=np.object_) + exp_categories = Index(["c", "b", "a"]) exp_values = np.array(["a", "b", "c", "a"], dtype=np.object_) res = cat.set_categories(["c", "b", "a"], inplace=True) - self.assert_numpy_array_equal(cat.categories, exp_categories) + self.assert_index_equal(cat.categories, exp_categories) self.assert_numpy_array_equal(cat.__array__(), exp_values) self.assertIsNone(res) res = cat.set_categories(["a", "b", "c"]) # cat must be the same as before - 
self.assert_numpy_array_equal(cat.categories, exp_categories) + self.assert_index_equal(cat.categories, exp_categories) self.assert_numpy_array_equal(cat.__array__(), exp_values) # only res is changed - exp_categories_back = np.array(["a", "b", "c"]) - self.assert_numpy_array_equal(res.categories, exp_categories_back) + exp_categories_back = Index(["a", "b", "c"]) + self.assert_index_equal(res.categories, exp_categories_back) self.assert_numpy_array_equal(res.__array__(), exp_values) # not all "old" included in "new" -> all not included ones are now @@ -836,19 +838,18 @@ def test_set_categories(self): res = cat.set_categories(["a", "b", "d"]) self.assert_numpy_array_equal(res.codes, np.array([0, 1, -1, 0], dtype=np.int8)) - self.assert_numpy_array_equal(res.categories, - np.array(["a", "b", "d"])) + self.assert_index_equal(res.categories, Index(["a", "b", "d"])) # all "old" included in "new" cat = cat.set_categories(["a", "b", "c", "d"]) - exp_categories = np.array(["a", "b", "c", "d"], dtype=np.object_) - self.assert_numpy_array_equal(cat.categories, exp_categories) + exp_categories = Index(["a", "b", "c", "d"]) + self.assert_index_equal(cat.categories, exp_categories) # internals... c = Categorical([1, 2, 3, 4, 1], categories=[1, 2, 3, 4], ordered=True) self.assert_numpy_array_equal(c._codes, np.array([0, 1, 2, 3, 0], dtype=np.int8)) - self.assert_numpy_array_equal(c.categories, np.array([1, 2, 3, 4])) + self.assert_index_equal(c.categories, Index([1, 2, 3, 4])) exp = np.array([1, 2, 3, 4, 1], dtype=np.int64) self.assert_numpy_array_equal(c.get_values(), exp) @@ -861,7 +862,7 @@ def test_set_categories(self): np.array([3, 2, 1, 0, 3], dtype=np.int8)) # categories are now in new order - self.assert_numpy_array_equal(c.categories, np.array([4, 3, 2, 1])) + self.assert_index_equal(c.categories, Index([4, 3, 2, 1])) # output is the same exp = np.array([1, 2, 3, 4, 1], dtype=np.int64) @@ -886,22 +887,20 @@ def test_rename_categories(self): res = cat.rename_categories([1, 2, 3]) self.assert_numpy_array_equal(res.__array__(), np.array([1, 2, 3, 1], dtype=np.int64)) - self.assert_numpy_array_equal(res.categories, - np.array([1, 2, 3], dtype=np.int64)) + self.assert_index_equal(res.categories, Index([1, 2, 3])) exp_cat = np.array(["a", "b", "c", "a"], dtype=np.object_) self.assert_numpy_array_equal(cat.__array__(), exp_cat) - exp_cat = np.array(["a", "b", "c"], dtype=np.object_) - self.assert_numpy_array_equal(cat.categories, exp_cat) + exp_cat = Index(["a", "b", "c"]) + self.assert_index_equal(cat.categories, exp_cat) res = cat.rename_categories([1, 2, 3], inplace=True) # and now inplace self.assertIsNone(res) self.assert_numpy_array_equal(cat.__array__(), np.array([1, 2, 3, 1], dtype=np.int64)) - self.assert_numpy_array_equal(cat.categories, - np.array([1, 2, 3], dtype=np.int64)) + self.assert_index_equal(cat.categories, Index([1, 2, 3])) # lengthen def f(): @@ -1025,14 +1024,14 @@ def test_remove_unused_categories(self): exp_categories_all = Index(["a", "b", "c", "d", "e"]) exp_categories_dropped = Index(["a", "b", "c", "d"]) - self.assert_numpy_array_equal(c.categories, exp_categories_all) + self.assert_index_equal(c.categories, exp_categories_all) res = c.remove_unused_categories() self.assert_index_equal(res.categories, exp_categories_dropped) self.assert_index_equal(c.categories, exp_categories_all) res = c.remove_unused_categories(inplace=True) - self.assert_numpy_array_equal(c.categories, exp_categories_dropped) + self.assert_index_equal(c.categories, exp_categories_dropped) 
self.assertIsNone(res) # with NaN values (GH11599) @@ -1065,11 +1064,11 @@ def test_nan_handling(self): # Nans are represented as -1 in codes c = Categorical(["a", "b", np.nan, "a"]) - self.assert_numpy_array_equal(c.categories, np.array(["a", "b"])) + self.assert_index_equal(c.categories, Index(["a", "b"])) self.assert_numpy_array_equal(c._codes, np.array([0, 1, -1, 0], dtype=np.int8)) c[1] = np.nan - self.assert_numpy_array_equal(c.categories, np.array(["a", "b"])) + self.assert_index_equal(c.categories, Index(["a", "b"])) self.assert_numpy_array_equal(c._codes, np.array([0, -1, -1, 0], dtype=np.int8)) @@ -1078,15 +1077,11 @@ def test_nan_handling(self): with tm.assert_produces_warning(FutureWarning): c = Categorical(["a", "b", np.nan, "a"], categories=["a", "b", np.nan]) - self.assert_numpy_array_equal(c.categories, - np.array(["a", "b", np.nan], - dtype=np.object_)) + self.assert_index_equal(c.categories, Index(["a", "b", np.nan])) self.assert_numpy_array_equal(c._codes, np.array([0, 1, 2, 0], dtype=np.int8)) c[1] = np.nan - self.assert_numpy_array_equal(c.categories, - np.array(["a", "b", np.nan], - dtype=np.object_)) + self.assert_index_equal(c.categories, Index(["a", "b", np.nan])) self.assert_numpy_array_equal(c._codes, np.array([0, 2, 2, 0], dtype=np.int8)) @@ -1095,30 +1090,24 @@ def test_nan_handling(self): with tm.assert_produces_warning(FutureWarning): c.categories = ["a", "b", np.nan] # noqa - self.assert_numpy_array_equal(c.categories, - np.array(["a", "b", np.nan], - dtype=np.object_)) + self.assert_index_equal(c.categories, Index(["a", "b", np.nan])) self.assert_numpy_array_equal(c._codes, np.array([0, 1, 2, 0], dtype=np.int8)) # Adding nan to categories should make assigned nan point to the # category! c = Categorical(["a", "b", np.nan, "a"]) - self.assert_numpy_array_equal(c.categories, np.array(["a", "b"])) + self.assert_index_equal(c.categories, Index(["a", "b"])) self.assert_numpy_array_equal(c._codes, np.array([0, 1, -1, 0], dtype=np.int8)) with tm.assert_produces_warning(FutureWarning): c.set_categories(["a", "b", np.nan], rename=True, inplace=True) - self.assert_numpy_array_equal(c.categories, - np.array(["a", "b", np.nan], - dtype=np.object_)) + self.assert_index_equal(c.categories, Index(["a", "b", np.nan])) self.assert_numpy_array_equal(c._codes, np.array([0, 1, -1, 0], dtype=np.int8)) c[1] = np.nan - self.assert_numpy_array_equal(c.categories, - np.array(["a", "b", np.nan], - dtype=np.object_)) + self.assert_index_equal(c.categories, Index(["a", "b", np.nan])) self.assert_numpy_array_equal(c._codes, np.array([0, 2, -1, 0], dtype=np.int8)) @@ -1244,63 +1233,58 @@ def test_min_max(self): def test_unique(self): # categories are reordered based on value when ordered=False cat = Categorical(["a", "b"]) - exp = np.asarray(["a", "b"]) + exp = Index(["a", "b"]) res = cat.unique() - self.assert_numpy_array_equal(res, exp) + self.assert_index_equal(res.categories, exp) + self.assert_categorical_equal(res, cat) cat = Categorical(["a", "b", "a", "a"], categories=["a", "b", "c"]) res = cat.unique() - self.assert_numpy_array_equal(res, exp) + self.assert_index_equal(res.categories, exp) tm.assert_categorical_equal(res, Categorical(exp)) cat = Categorical(["c", "a", "b", "a", "a"], categories=["a", "b", "c"]) - exp = np.asarray(["c", "a", "b"]) + exp = Index(["c", "a", "b"]) res = cat.unique() - self.assert_numpy_array_equal(res, exp) - tm.assert_categorical_equal(res, Categorical( - exp, categories=['c', 'a', 'b'])) + self.assert_index_equal(res.categories, exp) + exp_cat = 
Categorical(exp, categories=['c', 'a', 'b']) + tm.assert_categorical_equal(res, exp_cat) # nan must be removed cat = Categorical(["b", np.nan, "b", np.nan, "a"], categories=["a", "b", "c"]) res = cat.unique() - exp = np.asarray(["b", np.nan, "a"], dtype=object) - self.assert_numpy_array_equal(res, exp) - tm.assert_categorical_equal(res, Categorical( - ["b", np.nan, "a"], categories=["b", "a"])) + exp = Index(["b", "a"]) + self.assert_index_equal(res.categories, exp) + exp_cat = Categorical(["b", np.nan, "a"], categories=["b", "a"]) + tm.assert_categorical_equal(res, exp_cat) def test_unique_ordered(self): # keep categories order when ordered=True cat = Categorical(['b', 'a', 'b'], categories=['a', 'b'], ordered=True) res = cat.unique() - exp = np.asarray(['b', 'a']) - exp_cat = Categorical(exp, categories=['a', 'b'], ordered=True) - self.assert_numpy_array_equal(res, exp) + exp_cat = Categorical(['b', 'a'], categories=['a', 'b'], ordered=True) tm.assert_categorical_equal(res, exp_cat) cat = Categorical(['c', 'b', 'a', 'a'], categories=['a', 'b', 'c'], ordered=True) res = cat.unique() - exp = np.asarray(['c', 'b', 'a']) - exp_cat = Categorical(exp, categories=['a', 'b', 'c'], ordered=True) - self.assert_numpy_array_equal(res, exp) + exp_cat = Categorical(['c', 'b', 'a'], categories=['a', 'b', 'c'], + ordered=True) tm.assert_categorical_equal(res, exp_cat) cat = Categorical(['b', 'a', 'a'], categories=['a', 'b', 'c'], ordered=True) res = cat.unique() - exp = np.asarray(['b', 'a']) - exp_cat = Categorical(exp, categories=['a', 'b'], ordered=True) - self.assert_numpy_array_equal(res, exp) + exp_cat = Categorical(['b', 'a'], categories=['a', 'b'], ordered=True) tm.assert_categorical_equal(res, exp_cat) cat = Categorical(['b', 'b', np.nan, 'a'], categories=['a', 'b', 'c'], ordered=True) res = cat.unique() - exp = np.asarray(['b', np.nan, 'a'], dtype=object) - exp_cat = Categorical(exp, categories=['a', 'b'], ordered=True) - self.assert_numpy_array_equal(res, exp) + exp_cat = Categorical(['b', np.nan, 'a'], categories=['a', 'b'], + ordered=True) tm.assert_categorical_equal(res, exp_cat) def test_mode(self): @@ -1308,33 +1292,33 @@ def test_mode(self): ordered=True) res = s.mode() exp = Categorical([5], categories=[5, 4, 3, 2, 1], ordered=True) - self.assertTrue(res.equals(exp)) + tm.assert_categorical_equal(res, exp) s = Categorical([1, 1, 1, 4, 5, 5, 5], categories=[5, 4, 3, 2, 1], ordered=True) res = s.mode() exp = Categorical([5, 1], categories=[5, 4, 3, 2, 1], ordered=True) - self.assertTrue(res.equals(exp)) + tm.assert_categorical_equal(res, exp) s = Categorical([1, 2, 3, 4, 5], categories=[5, 4, 3, 2, 1], ordered=True) res = s.mode() exp = Categorical([], categories=[5, 4, 3, 2, 1], ordered=True) - self.assertTrue(res.equals(exp)) + tm.assert_categorical_equal(res, exp) # NaN should not become the mode! 
s = Categorical([np.nan, np.nan, np.nan, 4, 5], categories=[5, 4, 3, 2, 1], ordered=True) res = s.mode() exp = Categorical([], categories=[5, 4, 3, 2, 1], ordered=True) - self.assertTrue(res.equals(exp)) + tm.assert_categorical_equal(res, exp) s = Categorical([np.nan, np.nan, np.nan, 4, 5, 4], categories=[5, 4, 3, 2, 1], ordered=True) res = s.mode() exp = Categorical([4], categories=[5, 4, 3, 2, 1], ordered=True) - self.assertTrue(res.equals(exp)) + tm.assert_categorical_equal(res, exp) s = Categorical([np.nan, np.nan, 4, 5, 4], categories=[5, 4, 3, 2, 1], ordered=True) res = s.mode() exp = Categorical([4], categories=[5, 4, 3, 2, 1], ordered=True) - self.assertTrue(res.equals(exp)) + tm.assert_categorical_equal(res, exp) def test_sort_values(self): @@ -1348,74 +1332,78 @@ def test_sort_values(self): res = cat.sort_values() exp = np.array(["a", "b", "c", "d"], dtype=object) self.assert_numpy_array_equal(res.__array__(), exp) + self.assert_index_equal(res.categories, cat.categories) cat = Categorical(["a", "c", "b", "d"], categories=["a", "b", "c", "d"], ordered=True) res = cat.sort_values() exp = np.array(["a", "b", "c", "d"], dtype=object) self.assert_numpy_array_equal(res.__array__(), exp) + self.assert_index_equal(res.categories, cat.categories) res = cat.sort_values(ascending=False) exp = np.array(["d", "c", "b", "a"], dtype=object) self.assert_numpy_array_equal(res.__array__(), exp) + self.assert_index_equal(res.categories, cat.categories) # sort (inplace order) cat1 = cat.copy() cat1.sort_values(inplace=True) exp = np.array(["a", "b", "c", "d"], dtype=object) self.assert_numpy_array_equal(cat1.__array__(), exp) + self.assert_index_equal(res.categories, cat.categories) # reverse cat = Categorical(["a", "c", "c", "b", "d"], ordered=True) res = cat.sort_values(ascending=False) exp_val = np.array(["d", "c", "c", "b", "a"], dtype=object) - exp_categories = np.array(["a", "b", "c", "d"], dtype=object) + exp_categories = Index(["a", "b", "c", "d"]) self.assert_numpy_array_equal(res.__array__(), exp_val) - self.assert_numpy_array_equal(res.categories, exp_categories) + self.assert_index_equal(res.categories, exp_categories) def test_sort_values_na_position(self): # see gh-12882 cat = Categorical([5, 2, np.nan, 2, np.nan], ordered=True) - exp_categories = np.array([2, 5]) + exp_categories = Index([2, 5]) exp = np.array([2.0, 2.0, 5.0, np.nan, np.nan]) res = cat.sort_values() # default arguments self.assert_numpy_array_equal(res.__array__(), exp) - self.assert_numpy_array_equal(res.categories, exp_categories) + self.assert_index_equal(res.categories, exp_categories) exp = np.array([np.nan, np.nan, 2.0, 2.0, 5.0]) res = cat.sort_values(ascending=True, na_position='first') self.assert_numpy_array_equal(res.__array__(), exp) - self.assert_numpy_array_equal(res.categories, exp_categories) + self.assert_index_equal(res.categories, exp_categories) exp = np.array([np.nan, np.nan, 5.0, 2.0, 2.0]) res = cat.sort_values(ascending=False, na_position='first') self.assert_numpy_array_equal(res.__array__(), exp) - self.assert_numpy_array_equal(res.categories, exp_categories) + self.assert_index_equal(res.categories, exp_categories) exp = np.array([2.0, 2.0, 5.0, np.nan, np.nan]) res = cat.sort_values(ascending=True, na_position='last') self.assert_numpy_array_equal(res.__array__(), exp) - self.assert_numpy_array_equal(res.categories, exp_categories) + self.assert_index_equal(res.categories, exp_categories) exp = np.array([5.0, 2.0, 2.0, np.nan, np.nan]) res = cat.sort_values(ascending=False, 
na_position='last') self.assert_numpy_array_equal(res.__array__(), exp) - self.assert_numpy_array_equal(res.categories, exp_categories) + self.assert_index_equal(res.categories, exp_categories) cat = Categorical(["a", "c", "b", "d", np.nan], ordered=True) res = cat.sort_values(ascending=False, na_position='last') exp_val = np.array(["d", "c", "b", "a", np.nan], dtype=object) - exp_categories = np.array(["a", "b", "c", "d"], dtype=object) + exp_categories = Index(["a", "b", "c", "d"]) self.assert_numpy_array_equal(res.__array__(), exp_val) - self.assert_numpy_array_equal(res.categories, exp_categories) + self.assert_index_equal(res.categories, exp_categories) cat = Categorical(["a", "c", "b", "d", np.nan], ordered=True) res = cat.sort_values(ascending=False, na_position='first') exp_val = np.array([np.nan, "d", "c", "b", "a"], dtype=object) - exp_categories = np.array(["a", "b", "c", "d"], dtype=object) + exp_categories = Index(["a", "b", "c", "d"]) self.assert_numpy_array_equal(res.__array__(), exp_val) - self.assert_numpy_array_equal(res.categories, exp_categories) + self.assert_index_equal(res.categories, exp_categories) def test_slicing_directly(self): cat = Categorical(["a", "b", "c", "d", "a", "b", "c"]) @@ -1430,7 +1418,7 @@ def test_set_item_nan(self): cat = pd.Categorical([1, 2, 3]) exp = pd.Categorical([1, np.nan, 3], categories=[1, 2, 3]) cat[1] = np.nan - self.assertTrue(cat.equals(exp)) + tm.assert_categorical_equal(cat, exp) # if nan in categories, the proper code should be set! cat = pd.Categorical([1, 2, 3, np.nan], categories=[1, 2, 3]) @@ -1570,10 +1558,10 @@ def test_deprecated_levels(self): exp = cat.categories with tm.assert_produces_warning(FutureWarning): res = cat.levels - self.assert_numpy_array_equal(res, exp) + self.assert_index_equal(res, exp) with tm.assert_produces_warning(FutureWarning): res = pd.Categorical([1, 2, 3, np.nan], levels=[1, 2, 3]) - self.assert_numpy_array_equal(res.categories, exp) + self.assert_index_equal(res.categories, exp) def test_removed_names_produces_warning(self): @@ -1587,14 +1575,18 @@ def test_removed_names_produces_warning(self): def test_datetime_categorical_comparison(self): dt_cat = pd.Categorical( pd.date_range('2014-01-01', periods=3), ordered=True) - self.assert_numpy_array_equal(dt_cat > dt_cat[0], [False, True, True]) - self.assert_numpy_array_equal(dt_cat[0] < dt_cat, [False, True, True]) + self.assert_numpy_array_equal(dt_cat > dt_cat[0], + np.array([False, True, True])) + self.assert_numpy_array_equal(dt_cat[0] < dt_cat, + np.array([False, True, True])) def test_reflected_comparison_with_scalars(self): # GH8658 cat = pd.Categorical([1, 2, 3], ordered=True) - self.assert_numpy_array_equal(cat > cat[0], [False, True, True]) - self.assert_numpy_array_equal(cat[0] < cat, [False, True, True]) + self.assert_numpy_array_equal(cat > cat[0], + np.array([False, True, True])) + self.assert_numpy_array_equal(cat[0] < cat, + np.array([False, True, True])) def test_comparison_with_unknown_scalars(self): # https://github.com/pydata/pandas/issues/9836#issuecomment-92123057 @@ -1607,8 +1599,10 @@ def test_comparison_with_unknown_scalars(self): self.assertRaises(TypeError, lambda: 4 < cat) self.assertRaises(TypeError, lambda: 4 > cat) - self.assert_numpy_array_equal(cat == 4, [False, False, False]) - self.assert_numpy_array_equal(cat != 4, [True, True, True]) + self.assert_numpy_array_equal(cat == 4, + np.array([False, False, False])) + self.assert_numpy_array_equal(cat != 4, + np.array([True, True, True])) def test_map(self): c
= pd.Categorical(list('ABABC'), categories=list('CBA'), @@ -1935,8 +1929,7 @@ def test_nan_handling(self): # Nans are represented as -1 in labels s = Series(Categorical(["a", "b", np.nan, "a"])) - self.assert_numpy_array_equal(s.cat.categories, - np.array(["a", "b"], dtype=np.object_)) + self.assert_index_equal(s.cat.categories, Index(["a", "b"])) self.assert_numpy_array_equal(s.values.codes, np.array([0, 1, -1, 0], dtype=np.int8)) @@ -1946,8 +1939,8 @@ def test_nan_handling(self): s2 = Series(Categorical(["a", "b", np.nan, "a"], categories=["a", "b", np.nan])) - exp_cat = np.array(["a", "b", np.nan], dtype=np.object_) - self.assert_numpy_array_equal(s2.cat.categories, exp_cat) + exp_cat = Index(["a", "b", np.nan]) + self.assert_index_equal(s2.cat.categories, exp_cat) self.assert_numpy_array_equal(s2.values.codes, np.array([0, 1, 2, 0], dtype=np.int8)) @@ -1956,24 +1949,26 @@ def test_nan_handling(self): with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): s3.cat.categories = ["a", "b", np.nan] - exp_cat = np.array(["a", "b", np.nan], dtype=np.object_) - self.assert_numpy_array_equal(s3.cat.categories, exp_cat) + exp_cat = Index(["a", "b", np.nan]) + self.assert_index_equal(s3.cat.categories, exp_cat) self.assert_numpy_array_equal(s3.values.codes, np.array([0, 1, 2, 0], dtype=np.int8)) def test_cat_accessor(self): s = Series(Categorical(["a", "b", np.nan, "a"])) - self.assert_numpy_array_equal(s.cat.categories, np.array(["a", "b"])) + self.assert_index_equal(s.cat.categories, Index(["a", "b"])) self.assertEqual(s.cat.ordered, False) exp = Categorical(["a", "b", np.nan, "a"], categories=["b", "a"]) s.cat.set_categories(["b", "a"], inplace=True) - self.assertTrue(s.values.equals(exp)) + tm.assert_categorical_equal(s.values, exp) + res = s.cat.set_categories(["b", "a"]) - self.assertTrue(res.values.equals(exp)) + tm.assert_categorical_equal(res.values, exp) + exp = Categorical(["a", "b", np.nan, "a"], categories=["b", "a"]) s[:] = "a" s = s.cat.remove_unused_categories() - self.assert_numpy_array_equal(s.cat.categories, np.array(["a"])) + self.assert_index_equal(s.cat.categories, Index(["a"])) def test_sequence_like(self): @@ -2015,11 +2010,11 @@ def test_series_delegations(self): # and the methods '.set_categories()' 'drop_unused_categories()' to the # categorical s = Series(Categorical(["a", "b", "c", "a"], ordered=True)) - exp_categories = np.array(["a", "b", "c"]) - self.assert_numpy_array_equal(s.cat.categories, exp_categories) + exp_categories = Index(["a", "b", "c"]) + tm.assert_index_equal(s.cat.categories, exp_categories) s.cat.categories = [1, 2, 3] - exp_categories = np.array([1, 2, 3]) - self.assert_numpy_array_equal(s.cat.categories, exp_categories) + exp_categories = Index([1, 2, 3]) + self.assert_index_equal(s.cat.categories, exp_categories) exp_codes = Series([0, 1, 2, 0], dtype='int8') tm.assert_series_equal(s.cat.codes, exp_codes) @@ -2032,20 +2027,20 @@ def test_series_delegations(self): # reorder s = Series(Categorical(["a", "b", "c", "a"], ordered=True)) - exp_categories = np.array(["c", "b", "a"]) + exp_categories = Index(["c", "b", "a"]) exp_values = np.array(["a", "b", "c", "a"], dtype=np.object_) s = s.cat.set_categories(["c", "b", "a"]) - self.assert_numpy_array_equal(s.cat.categories, exp_categories) + tm.assert_index_equal(s.cat.categories, exp_categories) self.assert_numpy_array_equal(s.values.__array__(), exp_values) self.assert_numpy_array_equal(s.__array__(), exp_values) # remove unused categories s = Series(Categorical(["a", "b", "b", "a"], 
categories=["a", "b", "c" ])) - exp_categories = np.array(["a", "b"], dtype=object) + exp_categories = Index(["a", "b"]) exp_values = np.array(["a", "b", "b", "a"], dtype=np.object_) s = s.cat.remove_unused_categories() - self.assert_numpy_array_equal(s.cat.categories, exp_categories) + self.assert_index_equal(s.cat.categories, exp_categories) self.assert_numpy_array_equal(s.values.__array__(), exp_values) self.assert_numpy_array_equal(s.__array__(), exp_values) @@ -2092,11 +2087,11 @@ def test_assignment_to_dataframe(self): result1 = df['D'] result2 = df['E'] - self.assertTrue(result1._data._block.values.equals(d)) + self.assert_categorical_equal(result1._data._block.values, d) # sorting s.name = 'E' - self.assertTrue(result2.sort_index().equals(s.sort_index())) + self.assert_series_equal(result2.sort_index(), s.sort_index()) cat = pd.Categorical([1, 2, 3, 10], categories=[1, 2, 3, 4, 10]) df = pd.DataFrame(pd.Series(cat)) @@ -3152,7 +3147,7 @@ def test_sort_values(self): res = df.sort_values(by=["sort"], ascending=False) exp = df.sort_values(by=["string"], ascending=True) - self.assert_numpy_array_equal(res["values"], exp["values"]) + self.assert_series_equal(res["values"], exp["values"]) self.assertEqual(res["sort"].dtype, "category") self.assertEqual(res["unsort"].dtype, "category") @@ -3906,15 +3901,15 @@ def f(): df1 = df[0:3] df2 = df[3:] - self.assert_numpy_array_equal(df['grade'].cat.categories, - df1['grade'].cat.categories) - self.assert_numpy_array_equal(df['grade'].cat.categories, - df2['grade'].cat.categories) + self.assert_index_equal(df['grade'].cat.categories, + df1['grade'].cat.categories) + self.assert_index_equal(df['grade'].cat.categories, + df2['grade'].cat.categories) dfx = pd.concat([df1, df2]) dfx['grade'].cat.categories - self.assert_numpy_array_equal(df['grade'].cat.categories, - dfx['grade'].cat.categories) + self.assert_index_equal(df['grade'].cat.categories, + dfx['grade'].cat.categories) def test_concat_preserve(self): diff --git a/pandas/tests/test_expressions.py b/pandas/tests/test_expressions.py index b6ed5dc68f905..cc0972937b8a2 100644 --- a/pandas/tests/test_expressions.py +++ b/pandas/tests/test_expressions.py @@ -287,7 +287,12 @@ def testit(): use_numexpr=True) expected = expr.evaluate(op, op_str, f, f, use_numexpr=False) - tm.assert_numpy_array_equal(result, expected.values) + + if isinstance(result, DataFrame): + tm.assert_frame_equal(result, expected) + else: + tm.assert_numpy_array_equal(result, + expected.values) result = expr._can_use_numexpr(op, op_str, f2, f2, 'evaluate') @@ -325,7 +330,10 @@ def testit(): use_numexpr=True) expected = expr.evaluate(op, op_str, f11, f12, use_numexpr=False) - tm.assert_numpy_array_equal(result, expected.values) + if isinstance(result, DataFrame): + tm.assert_frame_equal(result, expected) + else: + tm.assert_numpy_array_equal(result, expected.values) result = expr._can_use_numexpr(op, op_str, f21, f22, 'evaluate') diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py index 36962a37ec898..83e1a17fc8b0c 100644 --- a/pandas/tests/test_generic.py +++ b/pandas/tests/test_generic.py @@ -1289,7 +1289,7 @@ def test_tz_convert_and_localize(self): df1 = DataFrame(np.ones(5), index=l0) df1 = getattr(df1, fn)('US/Pacific') - self.assertTrue(df1.index.equals(l0_expected)) + self.assert_index_equal(df1.index, l0_expected) # MultiIndex # GH7846 @@ -1297,14 +1297,14 @@ def test_tz_convert_and_localize(self): df3 = getattr(df2, fn)('US/Pacific', level=0) self.assertFalse(df3.index.levels[0].equals(l0)) - 
self.assertTrue(df3.index.levels[0].equals(l0_expected)) - self.assertTrue(df3.index.levels[1].equals(l1)) + self.assert_index_equal(df3.index.levels[0], l0_expected) + self.assert_index_equal(df3.index.levels[1], l1) self.assertFalse(df3.index.levels[1].equals(l1_expected)) df3 = getattr(df2, fn)('US/Pacific', level=1) - self.assertTrue(df3.index.levels[0].equals(l0)) + self.assert_index_equal(df3.index.levels[0], l0) self.assertFalse(df3.index.levels[0].equals(l0_expected)) - self.assertTrue(df3.index.levels[1].equals(l1_expected)) + self.assert_index_equal(df3.index.levels[1], l1_expected) self.assertFalse(df3.index.levels[1].equals(l1)) df4 = DataFrame(np.ones(5), @@ -1313,9 +1313,9 @@ def test_tz_convert_and_localize(self): # TODO: untested df5 = getattr(df4, fn)('US/Pacific', level=1) # noqa - self.assertTrue(df3.index.levels[0].equals(l0)) + self.assert_index_equal(df3.index.levels[0], l0) self.assertFalse(df3.index.levels[0].equals(l0_expected)) - self.assertTrue(df3.index.levels[1].equals(l1_expected)) + self.assert_index_equal(df3.index.levels[1], l1_expected) self.assertFalse(df3.index.levels[1].equals(l1)) # Bad Inputs diff --git a/pandas/tests/test_graphics.py b/pandas/tests/test_graphics.py index b59d6ac0027dd..b09185c19bffb 100644 --- a/pandas/tests/test_graphics.py +++ b/pandas/tests/test_graphics.py @@ -706,14 +706,12 @@ def test_bar_log(self): expected = np.hstack((1.0e-04, expected, 1.0e+01)) ax = Series([0.1, 0.01, 0.001]).plot(log=True, kind='bar') - tm.assert_numpy_array_equal(ax.get_ylim(), - (0.001, 0.10000000000000001)) + self.assertEqual(ax.get_ylim(), (0.001, 0.10000000000000001)) tm.assert_numpy_array_equal(ax.yaxis.get_ticklocs(), expected) tm.close() ax = Series([0.1, 0.01, 0.001]).plot(log=True, kind='barh') - tm.assert_numpy_array_equal(ax.get_xlim(), - (0.001, 0.10000000000000001)) + self.assertEqual(ax.get_xlim(), (0.001, 0.10000000000000001)) tm.assert_numpy_array_equal(ax.xaxis.get_ticklocs(), expected) @slow @@ -2205,11 +2203,11 @@ def test_scatter_colors(self): ax = df.plot.scatter(x='a', y='b', c='c') tm.assert_numpy_array_equal(ax.collections[0].get_facecolor()[0], - (0, 0, 1, 1)) + np.array([0, 0, 1, 1], dtype=np.float64)) ax = df.plot.scatter(x='a', y='b', color='white') tm.assert_numpy_array_equal(ax.collections[0].get_facecolor()[0], - (1, 1, 1, 1)) + np.array([1, 1, 1, 1], dtype=np.float64)) @slow def test_plot_bar(self): diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 1996d132e01ba..6659e6b106a67 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -1088,13 +1088,13 @@ def test_transform_broadcast(self): grouped = self.ts.groupby(lambda x: x.month) result = grouped.transform(np.mean) - self.assertTrue(result.index.equals(self.ts.index)) + self.assert_index_equal(result.index, self.ts.index) for _, gp in grouped: assert_fp_equal(result.reindex(gp.index), gp.mean()) grouped = self.tsframe.groupby(lambda x: x.month) result = grouped.transform(np.mean) - self.assertTrue(result.index.equals(self.tsframe.index)) + self.assert_index_equal(result.index, self.tsframe.index) for _, gp in grouped: agged = gp.mean() res = result.reindex(gp.index) @@ -1105,8 +1105,8 @@ def test_transform_broadcast(self): grouped = self.tsframe.groupby({'A': 0, 'B': 0, 'C': 1, 'D': 1}, axis=1) result = grouped.transform(np.mean) - self.assertTrue(result.index.equals(self.tsframe.index)) - self.assertTrue(result.columns.equals(self.tsframe.columns)) + self.assert_index_equal(result.index, self.tsframe.index) + 
self.assert_index_equal(result.columns, self.tsframe.columns) for _, gp in grouped: agged = gp.mean(1) res = result.reindex(columns=gp.columns) @@ -2137,7 +2137,7 @@ def test_groupby_multiple_key(self): lambda x: x.day], axis=1) agged = grouped.agg(lambda x: x.sum()) - self.assertTrue(agged.index.equals(df.columns)) + self.assert_index_equal(agged.index, df.columns) assert_almost_equal(df.T.values, agged.values) agged = grouped.agg(lambda x: x.sum()) @@ -2549,7 +2549,7 @@ def f(piece): result = grouped.apply(f) tm.assertIsInstance(result, DataFrame) - self.assertTrue(result.index.equals(ts.index)) + self.assert_index_equal(result.index, ts.index) def test_apply_series_yield_constant(self): result = self.df.groupby(['A', 'B'])['C'].apply(len) @@ -2559,7 +2559,7 @@ def test_apply_frame_to_series(self): grouped = self.df.groupby(['A', 'B']) result = grouped.apply(len) expected = grouped.count()['C'] - self.assertTrue(result.index.equals(expected.index)) + self.assert_index_equal(result.index, expected.index) self.assert_numpy_array_equal(result.values, expected.values) def test_apply_frame_concat_series(self): @@ -2673,26 +2673,26 @@ def test_groupby_with_hier_columns(self): df = DataFrame(np.random.randn(8, 4), index=index, columns=columns) result = df.groupby(level=0).mean() - self.assertTrue(result.columns.equals(columns)) + self.assert_index_equal(result.columns, columns) result = df.groupby(level=0, axis=1).mean() - self.assertTrue(result.index.equals(df.index)) + self.assert_index_equal(result.index, df.index) result = df.groupby(level=0).agg(np.mean) - self.assertTrue(result.columns.equals(columns)) + self.assert_index_equal(result.columns, columns) result = df.groupby(level=0).apply(lambda x: x.mean()) - self.assertTrue(result.columns.equals(columns)) + self.assert_index_equal(result.columns, columns) result = df.groupby(level=0, axis=1).agg(lambda x: x.mean(1)) - self.assertTrue(result.columns.equals(Index(['A', 'B']))) - self.assertTrue(result.index.equals(df.index)) + self.assert_index_equal(result.columns, Index(['A', 'B'])) + self.assert_index_equal(result.index, df.index) # add a nuisance column sorted_columns, _ = columns.sortlevel(0) df['A', 'foo'] = 'bar' result = df.groupby(level=0).mean() - self.assertTrue(result.columns.equals(df.columns[:-1])) + self.assert_index_equal(result.columns, df.columns[:-1]) def test_pass_args_kwargs(self): from numpy import percentile @@ -3413,18 +3413,18 @@ def test_panel_groupby(self): tm.assert_panel_equal(agged, agged2) - self.assert_numpy_array_equal(agged.items, [0, 1]) + self.assert_index_equal(agged.items, Index([0, 1])) grouped = self.panel.groupby(lambda x: x.month, axis='major') agged = grouped.mean() - self.assert_numpy_array_equal(agged.major_axis, sorted(list(set( - self.panel.major_axis.month)))) + exp = Index(sorted(list(set(self.panel.major_axis.month)))) + self.assert_index_equal(agged.major_axis, exp) grouped = self.panel.groupby({'A': 0, 'B': 0, 'C': 1, 'D': 1}, axis='minor') agged = grouped.mean() - self.assert_numpy_array_equal(agged.minor_axis, [0, 1]) + self.assert_index_equal(agged.minor_axis, Index([0, 1])) def test_numpy_groupby(self): from pandas.core.groupby import numpy_groupby @@ -3450,7 +3450,7 @@ def test_groupby_2d_malformed(self): d['label'] = ['l1', 'l2'] tmp = d.groupby(['group']).mean() res_values = np.array([[0, 1], [0, 1]], dtype=np.int64) - self.assert_numpy_array_equal(tmp.columns, ['zeros', 'ones']) + self.assert_index_equal(tmp.columns, Index(['zeros', 'ones'])) 
self.assert_numpy_array_equal(tmp.values, res_values) def test_int32_overflow(self): @@ -3489,10 +3489,10 @@ def test_int64_overflow(self): right = rg.sum()['values'] exp_index, _ = left.index.sortlevel(0) - self.assertTrue(left.index.equals(exp_index)) + self.assert_index_equal(left.index, exp_index) exp_index, _ = right.index.sortlevel(0) - self.assertTrue(right.index.equals(exp_index)) + self.assert_index_equal(right.index, exp_index) tups = list(map(tuple, df[['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H' ]].values)) @@ -3720,9 +3720,9 @@ def test_agg_multiple_functions_maintain_order(self): # GH #610 funcs = [('mean', np.mean), ('max', np.max), ('min', np.min)] result = self.df.groupby('A')['C'].agg(funcs) - exp_cols = ['mean', 'max', 'min'] + exp_cols = Index(['mean', 'max', 'min']) - self.assert_numpy_array_equal(result.columns, exp_cols) + self.assert_index_equal(result.columns, exp_cols) def test_multiple_functions_tuples_and_non_tuples(self): # #1359 @@ -4275,10 +4275,10 @@ def test_multiindex_columns_empty_level(self): df = DataFrame([[long(1), 'A']], columns=midx) grouped = df.groupby('to filter').groups - self.assert_numpy_array_equal(grouped['A'], [0]) + self.assertEqual(grouped['A'], [0]) grouped = df.groupby([('to filter', '')]).groups - self.assert_numpy_array_equal(grouped['A'], [0]) + self.assertEqual(grouped['A'], [0]) df = DataFrame([[long(1), 'A'], [long(2), 'B']], columns=midx) @@ -5853,25 +5853,23 @@ def test_lexsort_indexer(self): keys = [[nan] * 5 + list(range(100)) + [nan] * 5] # orders=True, na_position='last' result = _lexsort_indexer(keys, orders=True, na_position='last') - expected = list(range(5, 105)) + list(range(5)) + list(range(105, 110)) - tm.assert_numpy_array_equal(result, expected) + exp = list(range(5, 105)) + list(range(5)) + list(range(105, 110)) + tm.assert_numpy_array_equal(result, np.array(exp)) # orders=True, na_position='first' result = _lexsort_indexer(keys, orders=True, na_position='first') - expected = list(range(5)) + list(range(105, 110)) + list(range(5, 105)) - tm.assert_numpy_array_equal(result, expected) + exp = list(range(5)) + list(range(105, 110)) + list(range(5, 105)) + tm.assert_numpy_array_equal(result, np.array(exp)) # orders=False, na_position='last' result = _lexsort_indexer(keys, orders=False, na_position='last') - expected = list(range(104, 4, -1)) + list(range(5)) + list(range(105, - 110)) - tm.assert_numpy_array_equal(result, expected) + exp = list(range(104, 4, -1)) + list(range(5)) + list(range(105, 110)) + tm.assert_numpy_array_equal(result, np.array(exp)) # orders=False, na_position='first' result = _lexsort_indexer(keys, orders=False, na_position='first') - expected = list(range(5)) + list(range(105, 110)) + list(range(104, 4, - -1)) - tm.assert_numpy_array_equal(result, expected) + exp = list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1)) + tm.assert_numpy_array_equal(result, np.array(exp)) def test_nargsort(self): # np.argsort(items) places NaNs last @@ -5897,54 +5895,50 @@ def test_nargsort(self): # mergesort, ascending=True, na_position='last' result = _nargsort(items, kind='mergesort', ascending=True, na_position='last') - expected = list(range(5, 105)) + list(range(5)) + list(range(105, 110)) - tm.assert_numpy_array_equal(result, expected) + exp = list(range(5, 105)) + list(range(5)) + list(range(105, 110)) + tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.int64)) # mergesort, ascending=True, na_position='first' result = _nargsort(items, kind='mergesort', ascending=True, na_position='first') - 
expected = list(range(5)) + list(range(105, 110)) + list(range(5, 105)) - tm.assert_numpy_array_equal(result, expected) + exp = list(range(5)) + list(range(105, 110)) + list(range(5, 105)) + tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.int64)) # mergesort, ascending=False, na_position='last' result = _nargsort(items, kind='mergesort', ascending=False, na_position='last') - expected = list(range(104, 4, -1)) + list(range(5)) + list(range(105, - 110)) - tm.assert_numpy_array_equal(result, expected) + exp = list(range(104, 4, -1)) + list(range(5)) + list(range(105, 110)) + tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.int64)) # mergesort, ascending=False, na_position='first' result = _nargsort(items, kind='mergesort', ascending=False, na_position='first') - expected = list(range(5)) + list(range(105, 110)) + list(range(104, 4, - -1)) - tm.assert_numpy_array_equal(result, expected) + exp = list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1)) + tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.int64)) # mergesort, ascending=True, na_position='last' result = _nargsort(items2, kind='mergesort', ascending=True, na_position='last') - expected = list(range(5, 105)) + list(range(5)) + list(range(105, 110)) - tm.assert_numpy_array_equal(result, expected) + exp = list(range(5, 105)) + list(range(5)) + list(range(105, 110)) + tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.int64)) # mergesort, ascending=True, na_position='first' result = _nargsort(items2, kind='mergesort', ascending=True, na_position='first') - expected = list(range(5)) + list(range(105, 110)) + list(range(5, 105)) - tm.assert_numpy_array_equal(result, expected) + exp = list(range(5)) + list(range(105, 110)) + list(range(5, 105)) + tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.int64)) # mergesort, ascending=False, na_position='last' result = _nargsort(items2, kind='mergesort', ascending=False, na_position='last') - expected = list(range(104, 4, -1)) + list(range(5)) + list(range(105, - 110)) - tm.assert_numpy_array_equal(result, expected) + exp = list(range(104, 4, -1)) + list(range(5)) + list(range(105, 110)) + tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.int64)) # mergesort, ascending=False, na_position='first' result = _nargsort(items2, kind='mergesort', ascending=False, na_position='first') - expected = list(range(5)) + list(range(105, 110)) + list(range(104, 4, - -1)) - tm.assert_numpy_array_equal(result, expected) + exp = list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1)) + tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.int64)) def test_datetime_count(self): df = DataFrame({'a': [1, 2, 3] * 2, diff --git a/pandas/tests/test_internals.py b/pandas/tests/test_internals.py index bf9574f48913a..6a97f195abba7 100644 --- a/pandas/tests/test_internals.py +++ b/pandas/tests/test_internals.py @@ -17,15 +17,19 @@ import pandas.core.algorithms as algos import pandas.util.testing as tm import pandas as pd +from pandas import lib from pandas.util.testing import (assert_almost_equal, assert_frame_equal, randn, assert_series_equal) from pandas.compat import zip, u def assert_block_equal(left, right): - assert_almost_equal(left.values, right.values) + tm.assert_numpy_array_equal(left.values, right.values) assert (left.dtype == right.dtype) - assert_almost_equal(left.mgr_locs, right.mgr_locs) + tm.assertIsInstance(left.mgr_locs, lib.BlockPlacement) + tm.assertIsInstance(right.mgr_locs, lib.BlockPlacement) + 
tm.assert_numpy_array_equal(left.mgr_locs.as_array, + right.mgr_locs.as_array) def get_numeric_mat(shape): @@ -207,7 +211,9 @@ def _check(blk): _check(self.bool_block) def test_mgr_locs(self): - assert_almost_equal(self.fblock.mgr_locs, [0, 2, 4]) + tm.assertIsInstance(self.fblock.mgr_locs, lib.BlockPlacement) + tm.assert_numpy_array_equal(self.fblock.mgr_locs.as_array, + np.array([0, 2, 4], dtype=np.int64)) def test_attrs(self): self.assertEqual(self.fblock.shape, self.fblock.values.shape) @@ -223,9 +229,10 @@ def test_merge(self): ablock = make_block(avals, ref_cols.get_indexer(['e', 'b'])) bblock = make_block(bvals, ref_cols.get_indexer(['a', 'd'])) merged = ablock.merge(bblock) - assert_almost_equal(merged.mgr_locs, [0, 1, 2, 3]) - assert_almost_equal(merged.values[[0, 2]], avals) - assert_almost_equal(merged.values[[1, 3]], bvals) + tm.assert_numpy_array_equal(merged.mgr_locs.as_array, + np.array([0, 1, 2, 3], dtype=np.int64)) + tm.assert_numpy_array_equal(merged.values[[0, 2]], np.array(avals)) + tm.assert_numpy_array_equal(merged.values[[1, 3]], np.array(bvals)) # TODO: merge with mixed type? @@ -246,17 +253,22 @@ def test_insert(self): def test_delete(self): newb = self.fblock.copy() newb.delete(0) - assert_almost_equal(newb.mgr_locs, [2, 4]) + tm.assertIsInstance(newb.mgr_locs, lib.BlockPlacement) + tm.assert_numpy_array_equal(newb.mgr_locs.as_array, + np.array([2, 4], dtype=np.int64)) self.assertTrue((newb.values[0] == 1).all()) newb = self.fblock.copy() newb.delete(1) - assert_almost_equal(newb.mgr_locs, [0, 4]) + tm.assertIsInstance(newb.mgr_locs, lib.BlockPlacement) + tm.assert_numpy_array_equal(newb.mgr_locs.as_array, + np.array([0, 4], dtype=np.int64)) self.assertTrue((newb.values[1] == 2).all()) newb = self.fblock.copy() newb.delete(2) - assert_almost_equal(newb.mgr_locs, [0, 2]) + tm.assert_numpy_array_equal(newb.mgr_locs.as_array, + np.array([0, 2], dtype=np.int64)) self.assertTrue((newb.values[1] == 1).all()) newb = self.fblock.copy() @@ -399,9 +411,9 @@ def test_get_scalar(self): for i, index in enumerate(self.mgr.axes[1]): res = self.mgr.get_scalar((item, index)) exp = self.mgr.get(item, fastpath=False)[i] - assert_almost_equal(res, exp) + self.assertEqual(res, exp) exp = self.mgr.get(item).internal_values()[i] - assert_almost_equal(res, exp) + self.assertEqual(res, exp) def test_get(self): cols = Index(list('abc')) @@ -421,10 +433,14 @@ def test_set(self): mgr.set('d', np.array(['foo'] * 3)) mgr.set('b', np.array(['bar'] * 3)) - assert_almost_equal(mgr.get('a').internal_values(), [0] * 3) - assert_almost_equal(mgr.get('b').internal_values(), ['bar'] * 3) - assert_almost_equal(mgr.get('c').internal_values(), [2] * 3) - assert_almost_equal(mgr.get('d').internal_values(), ['foo'] * 3) + tm.assert_numpy_array_equal(mgr.get('a').internal_values(), + np.array([0] * 3)) + tm.assert_numpy_array_equal(mgr.get('b').internal_values(), + np.array(['bar'] * 3, dtype=np.object_)) + tm.assert_numpy_array_equal(mgr.get('c').internal_values(), + np.array([2] * 3)) + tm.assert_numpy_array_equal(mgr.get('d').internal_values(), + np.array(['foo'] * 3, dtype=np.object_)) def test_insert(self): self.mgr.insert(0, 'inserted', np.arange(N)) @@ -689,8 +705,9 @@ def test_consolidate_ordering_issues(self): self.assertEqual(cons.nblocks, 4) cons = self.mgr.consolidate().get_numeric_data() self.assertEqual(cons.nblocks, 1) - assert_almost_equal(cons.blocks[0].mgr_locs, - np.arange(len(cons.items))) + tm.assertIsInstance(cons.blocks[0].mgr_locs, lib.BlockPlacement) + 
tm.assert_numpy_array_equal(cons.blocks[0].mgr_locs.as_array, + np.arange(len(cons.items), dtype=np.int64)) def test_reindex_index(self): pass @@ -786,18 +803,18 @@ def test_get_bool_data(self): bools.get('bool').internal_values()) bools.set('bool', np.array([True, False, True])) - assert_almost_equal( - mgr.get('bool', fastpath=False), [True, False, True]) - assert_almost_equal( - mgr.get('bool').internal_values(), [True, False, True]) + tm.assert_numpy_array_equal(mgr.get('bool', fastpath=False), + np.array([True, False, True])) + tm.assert_numpy_array_equal(mgr.get('bool').internal_values(), + np.array([True, False, True])) # Check sharing bools2 = mgr.get_bool_data(copy=True) bools2.set('bool', np.array([False, True, False])) - assert_almost_equal( - mgr.get('bool', fastpath=False), [True, False, True]) - assert_almost_equal( - mgr.get('bool').internal_values(), [True, False, True]) + tm.assert_numpy_array_equal(mgr.get('bool', fastpath=False), + np.array([True, False, True])) + tm.assert_numpy_array_equal(mgr.get('bool').internal_values(), + np.array([True, False, True])) def test_unicode_repr_doesnt_raise(self): repr(create_mgr(u('b,\u05d0: object'))) @@ -892,8 +909,7 @@ def assert_slice_ok(mgr, axis, slobj): mat_slobj = (slice(None), ) * axis + (slobj, ) tm.assert_numpy_array_equal(mat[mat_slobj], sliced.as_matrix(), check_dtype=False) - tm.assert_numpy_array_equal(mgr.axes[axis][slobj], - sliced.axes[axis]) + tm.assert_index_equal(mgr.axes[axis][slobj], sliced.axes[axis]) for mgr in self.MANAGERS: for ax in range(mgr.ndim): @@ -931,8 +947,8 @@ def assert_take_ok(mgr, axis, indexer): taken = mgr.take(indexer, axis) tm.assert_numpy_array_equal(np.take(mat, indexer, axis), taken.as_matrix(), check_dtype=False) - tm.assert_numpy_array_equal(mgr.axes[axis].take(indexer), - taken.axes[axis]) + tm.assert_index_equal(mgr.axes[axis].take(indexer), + taken.axes[axis]) for mgr in self.MANAGERS: for ax in range(mgr.ndim): diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 63a8b49ab4b00..c4ccef13f2844 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -87,19 +87,19 @@ def test_append_index(self): (1.2, datetime.datetime(2011, 1, 2, tzinfo=tz)), (1.3, datetime.datetime(2011, 1, 3, tzinfo=tz))] expected = Index([1.1, 1.2, 1.3] + expected_tuples) - self.assertTrue(result.equals(expected)) + self.assert_index_equal(result, expected) result = midx_lv2.append(idx1) expected = Index(expected_tuples + [1.1, 1.2, 1.3]) - self.assertTrue(result.equals(expected)) + self.assert_index_equal(result, expected) result = midx_lv2.append(midx_lv2) - expected = MultiIndex.from_arrays([idx1.append(idx1), idx2.append(idx2) - ]) - self.assertTrue(result.equals(expected)) + expected = MultiIndex.from_arrays([idx1.append(idx1), + idx2.append(idx2)]) + self.assert_index_equal(result, expected) result = midx_lv2.append(midx_lv3) - self.assertTrue(result.equals(expected)) + self.assert_index_equal(result, expected) result = midx_lv3.append(midx_lv2) expected = Index._simple_new( @@ -107,7 +107,7 @@ def test_append_index(self): (1.2, datetime.datetime(2011, 1, 2, tzinfo=tz), 'B'), (1.3, datetime.datetime(2011, 1, 3, tzinfo=tz), 'C')] + expected_tuples), None) - self.assertTrue(result.equals(expected)) + self.assert_index_equal(result, expected) def test_dataframe_constructor(self): multi = DataFrame(np.random.randn(4, 4), @@ -966,9 +966,7 @@ def check(left, right): assert_series_equal(left, right) self.assertFalse(left.index.is_unique) li, ri = 
left.index, right.index - for i in range(ri.nlevels): - tm.assert_numpy_array_equal(li.levels[i], ri.levels[i]) - tm.assert_numpy_array_equal(li.labels[i], ri.labels[i]) + tm.assert_index_equal(li, ri) df = DataFrame(np.arange(12).reshape(4, 3), index=list('abab'), @@ -1542,8 +1540,8 @@ def aggf(x): # for good measure, groupby detail level_index = frame._get_axis(axis).levels[level] - self.assertTrue(leftside._get_axis(axis).equals(level_index)) - self.assertTrue(rightside._get_axis(axis).equals(level_index)) + self.assert_index_equal(leftside._get_axis(axis), level_index) + self.assert_index_equal(rightside._get_axis(axis), level_index) assert_frame_equal(leftside, rightside) @@ -2211,12 +2209,11 @@ def test_datetimeindex(self): tz='US/Eastern') idx = MultiIndex.from_arrays([idx1, idx2]) - expected1 = pd.DatetimeIndex( - ['2013-04-01 9:00', '2013-04-02 9:00', '2013-04-03 9:00' - ], tz='Asia/Tokyo') + expected1 = pd.DatetimeIndex(['2013-04-01 9:00', '2013-04-02 9:00', + '2013-04-03 9:00'], tz='Asia/Tokyo') - self.assertTrue(idx.levels[0].equals(expected1)) - self.assertTrue(idx.levels[1].equals(idx2)) + self.assert_index_equal(idx.levels[0], expected1) + self.assert_index_equal(idx.levels[1], idx2) # from datetime combos # GH 7888 @@ -2256,18 +2253,20 @@ def test_set_index_datetime(self): df.index = pd.to_datetime(df.pop('datetime'), utc=True) df.index = df.index.tz_localize('UTC').tz_convert('US/Pacific') - expected = pd.DatetimeIndex( - ['2011-07-19 07:00:00', '2011-07-19 08:00:00', - '2011-07-19 09:00:00']) + expected = pd.DatetimeIndex(['2011-07-19 07:00:00', + '2011-07-19 08:00:00', + '2011-07-19 09:00:00'], name='datetime') expected = expected.tz_localize('UTC').tz_convert('US/Pacific') df = df.set_index('label', append=True) - self.assertTrue(df.index.levels[0].equals(expected)) - self.assertTrue(df.index.levels[1].equals(pd.Index(['a', 'b']))) + self.assert_index_equal(df.index.levels[0], expected) + self.assert_index_equal(df.index.levels[1], + pd.Index(['a', 'b'], name='label')) df = df.swaplevel(0, 1) - self.assertTrue(df.index.levels[0].equals(pd.Index(['a', 'b']))) - self.assertTrue(df.index.levels[1].equals(expected)) + self.assert_index_equal(df.index.levels[0], + pd.Index(['a', 'b'], name='label')) + self.assert_index_equal(df.index.levels[1], expected) df = DataFrame(np.random.random(6)) idx1 = pd.DatetimeIndex(['2011-07-19 07:00:00', '2011-07-19 08:00:00', @@ -2287,17 +2286,17 @@ def test_set_index_datetime(self): expected1 = pd.DatetimeIndex(['2011-07-19 07:00:00', '2011-07-19 08:00:00', '2011-07-19 09:00:00'], tz='US/Eastern') - expected2 = pd.DatetimeIndex( - ['2012-04-01 09:00', '2012-04-02 09:00'], tz='US/Eastern') + expected2 = pd.DatetimeIndex(['2012-04-01 09:00', '2012-04-02 09:00'], + tz='US/Eastern') - self.assertTrue(df.index.levels[0].equals(expected1)) - self.assertTrue(df.index.levels[1].equals(expected2)) - self.assertTrue(df.index.levels[2].equals(idx3)) + self.assert_index_equal(df.index.levels[0], expected1) + self.assert_index_equal(df.index.levels[1], expected2) + self.assert_index_equal(df.index.levels[2], idx3) # GH 7092 - self.assertTrue(df.index.get_level_values(0).equals(idx1)) - self.assertTrue(df.index.get_level_values(1).equals(idx2)) - self.assertTrue(df.index.get_level_values(2).equals(idx3)) + self.assert_index_equal(df.index.get_level_values(0), idx1) + self.assert_index_equal(df.index.get_level_values(1), idx2) + self.assert_index_equal(df.index.get_level_values(2), idx3) def test_reset_index_datetime(self): # GH 3950 @@ -2404,13 +2403,13 
@@ def test_set_index_period(self): expected1 = pd.period_range('2011-01-01', periods=3, freq='M') expected2 = pd.period_range('2013-01-01 09:00', periods=2, freq='H') - self.assertTrue(df.index.levels[0].equals(expected1)) - self.assertTrue(df.index.levels[1].equals(expected2)) - self.assertTrue(df.index.levels[2].equals(idx3)) + self.assert_index_equal(df.index.levels[0], expected1) + self.assert_index_equal(df.index.levels[1], expected2) + self.assert_index_equal(df.index.levels[2], idx3) - self.assertTrue(df.index.get_level_values(0).equals(idx1)) - self.assertTrue(df.index.get_level_values(1).equals(idx2)) - self.assertTrue(df.index.get_level_values(2).equals(idx3)) + self.assert_index_equal(df.index.get_level_values(0), idx1) + self.assert_index_equal(df.index.get_level_values(1), idx2) + self.assert_index_equal(df.index.get_level_values(2), idx3) def test_repeat(self): # GH 9361 diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py index 7f8fb8fa424d1..e244a04127949 100644 --- a/pandas/tests/test_nanops.py +++ b/pandas/tests/test_nanops.py @@ -929,7 +929,7 @@ def test_axis(self): samples = np.vstack([self.samples, np.nan * np.ones(len(self.samples))]) skew = nanops.nanskew(samples, axis=1) - tm.assert_almost_equal(skew, [self.actual_skew, np.nan]) + tm.assert_almost_equal(skew, np.array([self.actual_skew, np.nan])) def test_nans(self): samples = np.hstack([self.samples, np.nan]) @@ -979,7 +979,7 @@ def test_axis(self): samples = np.vstack([self.samples, np.nan * np.ones(len(self.samples))]) kurt = nanops.nankurt(samples, axis=1) - tm.assert_almost_equal(kurt, [self.actual_kurt, np.nan]) + tm.assert_almost_equal(kurt, np.array([self.actual_kurt, np.nan])) def test_nans(self): samples = np.hstack([self.samples, np.nan]) diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index 87401f272adbd..7792a1f5d3509 100644 --- a/pandas/tests/test_panel.py +++ b/pandas/tests/test_panel.py @@ -1086,12 +1086,12 @@ def test_ctor_dict(self): # TODO: unused? 
wp3 = Panel.from_dict(d3) # noqa - self.assertTrue(wp.major_axis.equals(self.panel.major_axis)) + self.assert_index_equal(wp.major_axis, self.panel.major_axis) assert_panel_equal(wp, wp2) # intersect wp = Panel.from_dict(d, intersect=True) - self.assertTrue(wp.major_axis.equals(itemb.index[5:])) + self.assert_index_equal(wp.major_axis, itemb.index[5:]) # use constructor assert_panel_equal(Panel(d), Panel.from_dict(d)) @@ -1123,7 +1123,7 @@ def test_constructor_dict_mixed(self): data = dict((k, v.values) for k, v in self.panel.iteritems()) result = Panel(data) exp_major = Index(np.arange(len(self.panel.major_axis))) - self.assertTrue(result.major_axis.equals(exp_major)) + self.assert_index_equal(result.major_axis, exp_major) result = Panel(data, items=self.panel.items, major_axis=self.panel.major_axis, @@ -1213,8 +1213,8 @@ def test_conform(self): df = self.panel['ItemA'][:-5].filter(items=['A', 'B']) conformed = self.panel.conform(df) - assert (conformed.index.equals(self.panel.major_axis)) - assert (conformed.columns.equals(self.panel.minor_axis)) + tm.assert_index_equal(conformed.index, self.panel.major_axis) + tm.assert_index_equal(conformed.columns, self.panel.minor_axis) def test_convert_objects(self): @@ -2078,11 +2078,11 @@ def test_rename(self): renamed = self.panel.rename_axis(mapper, axis=0) exp = Index(['foo', 'bar', 'baz']) - self.assertTrue(renamed.items.equals(exp)) + self.assert_index_equal(renamed.items, exp) renamed = self.panel.rename_axis(str.lower, axis=2) exp = Index(['a', 'b', 'c', 'd']) - self.assertTrue(renamed.minor_axis.equals(exp)) + self.assert_index_equal(renamed.minor_axis, exp) # don't copy renamed_nocopy = self.panel.rename_axis(mapper, axis=0, copy=False) @@ -2485,7 +2485,7 @@ def test_axis_dummies(self): transformed = make_axis_dummies(self.panel, 'minor', transform=mapping.get) self.assertEqual(len(transformed.columns), 2) - self.assert_numpy_array_equal(transformed.columns, ['one', 'two']) + self.assert_index_equal(transformed.columns, Index(['one', 'two'])) # TODO: test correctness @@ -2578,10 +2578,10 @@ def _monotonic(arr): def test_panel_index(): index = panelm.panel_index([1, 2, 3, 4], [1, 2, 3]) - expected = MultiIndex.from_arrays([np.tile( - [1, 2, 3, 4], 3), np.repeat( - [1, 2, 3], 4)]) - assert (index.equals(expected)) + expected = MultiIndex.from_arrays([np.tile([1, 2, 3, 4], 3), + np.repeat([1, 2, 3], 4)], + names=['time', 'panel']) + tm.assert_index_equal(index, expected) def test_import_warnings(): diff --git a/pandas/tests/test_panel4d.py b/pandas/tests/test_panel4d.py index e3e906d48ae98..607048df29faa 100644 --- a/pandas/tests/test_panel4d.py +++ b/pandas/tests/test_panel4d.py @@ -733,7 +733,7 @@ def test_constructor_dict_mixed(self): data = dict((k, v.values) for k, v in self.panel4d.iteritems()) result = Panel4D(data) exp_major = Index(np.arange(len(self.panel4d.major_axis))) - self.assertTrue(result.major_axis.equals(exp_major)) + self.assert_index_equal(result.major_axis, exp_major) result = Panel4D(data, labels=self.panel4d.labels, @@ -799,9 +799,9 @@ def test_conform(self): p = self.panel4d['l1'].filter(items=['ItemA', 'ItemB']) conformed = self.panel4d.conform(p) - assert(conformed.items.equals(self.panel4d.labels)) - assert(conformed.major_axis.equals(self.panel4d.major_axis)) - assert(conformed.minor_axis.equals(self.panel4d.minor_axis)) + tm.assert_index_equal(conformed.items, self.panel4d.labels) + tm.assert_index_equal(conformed.major_axis, self.panel4d.major_axis) + tm.assert_index_equal(conformed.minor_axis, 
self.panel4d.minor_axis) def test_reindex(self): ref = self.panel4d['l2'] @@ -1085,11 +1085,11 @@ def test_rename(self): renamed = self.panel4d.rename_axis(mapper, axis=0) exp = Index(['foo', 'bar', 'baz']) - self.assertTrue(renamed.labels.equals(exp)) + self.assert_index_equal(renamed.labels, exp) renamed = self.panel4d.rename_axis(str.lower, axis=3) exp = Index(['a', 'b', 'c', 'd']) - self.assertTrue(renamed.minor_axis.equals(exp)) + self.assert_index_equal(renamed.minor_axis, exp) # don't copy renamed_nocopy = self.panel4d.rename_axis(mapper, axis=0, copy=False) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 423a288077c4d..3d1851966afd0 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -48,12 +48,12 @@ def test_iter(self): # indices of each yielded Series should be equal to the index of # the original Series - tm.assert_numpy_array_equal(s.index, ds.index) + tm.assert_index_equal(s.index, ds.index) for el in s: # each element of the series is either a basestring/str or nan - self.assertTrue(isinstance(el, compat.string_types) or isnull( - el)) + self.assertTrue(isinstance(el, compat.string_types) or + isnull(el)) # desired behavior is to iterate until everything would be nan on the # next iter so make sure the last element of the iterator was 'l' in @@ -95,8 +95,8 @@ def test_iter_object_try_string(self): self.assertEqual(s, 'h') def test_cat(self): - one = ['a', 'a', 'b', 'b', 'c', NA] - two = ['a', NA, 'b', 'd', 'foo', NA] + one = np.array(['a', 'a', 'b', 'b', 'c', NA], dtype=np.object_) + two = np.array(['a', NA, 'b', 'd', 'foo', NA], dtype=np.object_) # single array result = strings.str_cat(one) @@ -121,21 +121,24 @@ def test_cat(self): # Multiple arrays result = strings.str_cat(one, [two], na_rep='NA') - exp = ['aa', 'aNA', 'bb', 'bd', 'cfoo', 'NANA'] + exp = np.array(['aa', 'aNA', 'bb', 'bd', 'cfoo', 'NANA'], + dtype=np.object_) self.assert_numpy_array_equal(result, exp) result = strings.str_cat(one, two) - exp = ['aa', NA, 'bb', 'bd', 'cfoo', NA] + exp = np.array(['aa', NA, 'bb', 'bd', 'cfoo', NA], dtype=np.object_) tm.assert_almost_equal(result, exp) def test_count(self): - values = ['foo', 'foofoo', NA, 'foooofooofommmfoo'] + values = np.array(['foo', 'foofoo', NA, 'foooofooofommmfoo'], + dtype=np.object_) result = strings.str_count(values, 'f[o]+') - exp = Series([1, 2, NA, 4]) - tm.assert_almost_equal(result, exp) + exp = np.array([1, 2, NA, 4]) + tm.assert_numpy_array_equal(result, exp) result = Series(values).str.count('f[o]+') + exp = Series([1, 2, NA, 4]) tm.assertIsInstance(result, Series) tm.assert_series_equal(result, exp) @@ -163,61 +166,66 @@ def test_count(self): tm.assert_series_equal(result, exp) def test_contains(self): - values = ['foo', NA, 'fooommm__foo', 'mmm_', 'foommm[_]+bar'] + values = np.array(['foo', NA, 'fooommm__foo', + 'mmm_', 'foommm[_]+bar'], dtype=np.object_) pat = 'mmm[_]+' result = strings.str_contains(values, pat) - expected = [False, NA, True, True, False] - tm.assert_almost_equal(result, expected) + expected = np.array([False, NA, True, True, False], dtype=np.object_) + tm.assert_numpy_array_equal(result, expected) result = strings.str_contains(values, pat, regex=False) - expected = [False, NA, False, False, True] - tm.assert_almost_equal(result, expected) + expected = np.array([False, NA, False, False, True], dtype=np.object_) + tm.assert_numpy_array_equal(result, expected) values = ['foo', 'xyz', 'fooommm__foo', 'mmm_'] result = strings.str_contains(values, pat) - expected = 
[False, False, True, True] + expected = np.array([False, False, True, True]) self.assertEqual(result.dtype, np.bool_) - tm.assert_almost_equal(result, expected) + tm.assert_numpy_array_equal(result, expected) # case insensitive using regex values = ['Foo', 'xYz', 'fOOomMm__fOo', 'MMM_'] result = strings.str_contains(values, 'FOO|mmm', case=False) - expected = [True, False, True, True] - tm.assert_almost_equal(result, expected) + expected = np.array([True, False, True, True]) + tm.assert_numpy_array_equal(result, expected) # case insensitive without regex result = strings.str_contains(values, 'foo', regex=False, case=False) - expected = [True, False, True, False] - tm.assert_almost_equal(result, expected) + expected = np.array([True, False, True, False]) + tm.assert_numpy_array_equal(result, expected) # mixed mixed = ['a', NA, 'b', True, datetime.today(), 'foo', None, 1, 2.] rs = strings.str_contains(mixed, 'o') - xp = Series([False, NA, False, NA, NA, True, NA, NA, NA]) - tm.assert_almost_equal(rs, xp) + xp = np.array([False, NA, False, NA, NA, True, NA, NA, NA], + dtype=np.object_) + tm.assert_numpy_array_equal(rs, xp) rs = Series(mixed).str.contains('o') + xp = Series([False, NA, False, NA, NA, True, NA, NA, NA]) tm.assertIsInstance(rs, Series) tm.assert_series_equal(rs, xp) # unicode - values = [u('foo'), NA, u('fooommm__foo'), u('mmm_')] + values = np.array([u'foo', NA, u'fooommm__foo', u'mmm_'], + dtype=np.object_) pat = 'mmm[_]+' result = strings.str_contains(values, pat) - expected = [False, np.nan, True, True] - tm.assert_almost_equal(result, expected) + expected = np.array([False, np.nan, True, True], dtype=np.object_) + tm.assert_numpy_array_equal(result, expected) result = strings.str_contains(values, pat, na=False) - expected = [False, False, True, True] - tm.assert_almost_equal(result, expected) + expected = np.array([False, False, True, True]) + tm.assert_numpy_array_equal(result, expected) - values = ['foo', 'xyz', 'fooommm__foo', 'mmm_'] + values = np.array(['foo', 'xyz', 'fooommm__foo', 'mmm_'], + dtype=np.object_) result = strings.str_contains(values, pat) - expected = [False, False, True, True] + expected = np.array([False, False, True, True]) self.assertEqual(result.dtype, np.bool_) - tm.assert_almost_equal(result, expected) + tm.assert_numpy_array_equal(result, expected) # na values = Series(['om', 'foo', np.nan]) @@ -232,13 +240,16 @@ def test_startswith(self): tm.assert_series_equal(result, exp) # mixed - mixed = ['a', NA, 'b', True, datetime.today(), 'foo', None, 1, 2.] + mixed = np.array(['a', NA, 'b', True, datetime.today(), + 'foo', None, 1, 2.], dtype=np.object_) rs = strings.str_startswith(mixed, 'f') - xp = Series([False, NA, False, NA, NA, True, NA, NA, NA]) - tm.assert_almost_equal(rs, xp) + xp = np.array([False, NA, False, NA, NA, True, NA, NA, NA], + dtype=np.object_) + tm.assert_numpy_array_equal(rs, xp) rs = Series(mixed).str.startswith('f') tm.assertIsInstance(rs, Series) + xp = Series([False, NA, False, NA, NA, True, NA, NA, NA]) tm.assert_series_equal(rs, xp) # unicode @@ -262,10 +273,12 @@ def test_endswith(self): # mixed mixed = ['a', NA, 'b', True, datetime.today(), 'foo', None, 1, 2.] 
rs = strings.str_endswith(mixed, 'f')
-        xp = Series([False, NA, False, NA, NA, False, NA, NA, NA])
-        tm.assert_almost_equal(rs, xp)
+        xp = np.array([False, NA, False, NA, NA, False, NA, NA, NA],
+                      dtype=np.object_)
+        tm.assert_numpy_array_equal(rs, xp)

         rs = Series(mixed).str.endswith('f')
+        xp = Series([False, NA, False, NA, NA, False, NA, NA, NA])
         tm.assertIsInstance(rs, Series)
         tm.assert_series_equal(rs, xp)

@@ -574,7 +587,12 @@ def test_extract_expand_False(self):
         s_or_idx = klass(['A1', 'A2'])
         result = s_or_idx.str.extract(r'(?P<uno>A)\d', expand=False)
         self.assertEqual(result.name, 'uno')
-        tm.assert_numpy_array_equal(result, klass(['A', 'A']))
+
+        exp = klass(['A', 'A'], name='uno')
+        if klass == Series:
+            tm.assert_series_equal(result, exp)
+        else:
+            tm.assert_index_equal(result, exp)

         s = Series(['A1', 'B2', 'C3'])
         # one group, no matches
@@ -713,8 +731,9 @@ def test_extract_expand_True(self):
         # single group renames series/index properly
         s_or_idx = klass(['A1', 'A2'])
         result_df = s_or_idx.str.extract(r'(?P<uno>A)\d', expand=True)
+        tm.assertIsInstance(result_df, DataFrame)
         result_series = result_df['uno']
-        tm.assert_numpy_array_equal(result_series, klass(['A', 'A']))
+        assert_series_equal(result_series, Series(['A', 'A'], name='uno'))

     def test_extract_series(self):
         # extract should give the same result whether or not the
@@ -1422,41 +1441,48 @@ def test_find_nan(self):
         tm.assert_series_equal(result, Series([4, np.nan, -1, np.nan, -1]))

     def test_index(self):
+
+        def _check(result, expected):
+            if isinstance(result, Series):
+                tm.assert_series_equal(result, expected)
+            else:
+                tm.assert_index_equal(result, expected)
+
         for klass in [Series, Index]:
             s = klass(['ABCDEFG', 'BCDEFEF', 'DEFGHIJEF', 'EFGHEF'])

             result = s.str.index('EF')
-            tm.assert_numpy_array_equal(result, klass([4, 3, 1, 0]))
+            _check(result, klass([4, 3, 1, 0]))
             expected = np.array([v.index('EF') for v in s.values],
                                 dtype=np.int64)
             tm.assert_numpy_array_equal(result.values, expected)

             result = s.str.rindex('EF')
-            tm.assert_numpy_array_equal(result, klass([4, 5, 7, 4]))
+            _check(result, klass([4, 5, 7, 4]))
             expected = np.array([v.rindex('EF') for v in s.values],
                                 dtype=np.int64)
             tm.assert_numpy_array_equal(result.values, expected)

             result = s.str.index('EF', 3)
-            tm.assert_numpy_array_equal(result, klass([4, 3, 7, 4]))
+            _check(result, klass([4, 3, 7, 4]))
             expected = np.array([v.index('EF', 3) for v in s.values],
                                 dtype=np.int64)
             tm.assert_numpy_array_equal(result.values, expected)

             result = s.str.rindex('EF', 3)
-            tm.assert_numpy_array_equal(result, klass([4, 5, 7, 4]))
+            _check(result, klass([4, 5, 7, 4]))
             expected = np.array([v.rindex('EF', 3) for v in s.values],
                                 dtype=np.int64)
             tm.assert_numpy_array_equal(result.values, expected)

             result = s.str.index('E', 4, 8)
-            tm.assert_numpy_array_equal(result, klass([4, 5, 7, 4]))
+            _check(result, klass([4, 5, 7, 4]))
             expected = np.array([v.index('E', 4, 8) for v in s.values],
                                 dtype=np.int64)
             tm.assert_numpy_array_equal(result.values, expected)

             result = s.str.rindex('E', 0, 5)
-            tm.assert_numpy_array_equal(result, klass([4, 3, 1, 4]))
+            _check(result, klass([4, 3, 1, 4]))
             expected = np.array([v.rindex('E', 0, 5) for v in s.values],
                                 dtype=np.int64)
             tm.assert_numpy_array_equal(result.values, expected)

@@ -1471,9 +1497,9 @@ def test_index(self):
         # test with nan
         s = Series(['abcb', 'ab', 'bcbe', np.nan])
         result = s.str.index('b')
-        tm.assert_numpy_array_equal(result, Series([1, 1, 0, np.nan]))
+        tm.assert_series_equal(result, Series([1, 1, 0, np.nan]))
         result = s.str.rindex('b')
-
tm.assert_numpy_array_equal(result, Series([3, 1, 2, np.nan])) + tm.assert_series_equal(result, Series([3, 1, 2, np.nan])) def test_pad(self): values = Series(['a', 'b', NA, 'c', NA, 'eeeeee']) @@ -1558,6 +1584,13 @@ def test_pad_fillchar(self): result = values.str.pad(5, fillchar=5) def test_translate(self): + + def _check(result, expected): + if isinstance(result, Series): + tm.assert_series_equal(result, expected) + else: + tm.assert_index_equal(result, expected) + for klass in [Series, Index]: s = klass(['abcdefg', 'abcc', 'cdddfg', 'cdefggg']) if not compat.PY3: @@ -1567,17 +1600,17 @@ def test_translate(self): table = str.maketrans('abc', 'cde') result = s.str.translate(table) expected = klass(['cdedefg', 'cdee', 'edddfg', 'edefggg']) - tm.assert_numpy_array_equal(result, expected) + _check(result, expected) # use of deletechars is python 2 only if not compat.PY3: result = s.str.translate(table, deletechars='fg') expected = klass(['cdede', 'cdee', 'eddd', 'ede']) - tm.assert_numpy_array_equal(result, expected) + _check(result, expected) result = s.str.translate(None, deletechars='fg') expected = klass(['abcde', 'abcc', 'cddd', 'cde']) - tm.assert_numpy_array_equal(result, expected) + _check(result, expected) else: with tm.assertRaisesRegexp( ValueError, "deletechars is not a valid argument"): @@ -1587,7 +1620,7 @@ def test_translate(self): s = Series(['a', 'b', 'c', 1.2]) expected = Series(['c', 'd', 'e', np.nan]) result = s.str.translate(table) - tm.assert_numpy_array_equal(result, expected) + tm.assert_series_equal(result, expected) def test_center_ljust_rjust(self): values = Series(['a', 'b', NA, 'c', NA, 'eeeeee']) @@ -1985,8 +2018,8 @@ def test_rsplit_to_multiindex_expand(self): idx = Index(['some_equal_splits', 'with_no_nans']) result = idx.str.rsplit('_', expand=True, n=1) - exp = MultiIndex.from_tuples([('some_equal', 'splits'), ('with_no', - 'nans')]) + exp = MultiIndex.from_tuples([('some_equal', 'splits'), + ('with_no', 'nans')]) tm.assert_index_equal(result, exp) self.assertEqual(result.nlevels, 2) @@ -1996,7 +2029,7 @@ def test_split_with_name(self): # should preserve name s = Series(['a,b', 'c,d'], name='xxx') res = s.str.split(',') - exp = Series([('a', 'b'), ('c', 'd')], name='xxx') + exp = Series([['a', 'b'], ['c', 'd']], name='xxx') tm.assert_series_equal(res, exp) res = s.str.split(',', expand=True) @@ -2018,60 +2051,60 @@ def test_partition_series(self): values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h']) result = values.str.partition('_', expand=False) - exp = Series([['a', '_', 'b_c'], ['c', '_', 'd_e'], NA, ['f', '_', - 'g_h']]) + exp = Series([('a', '_', 'b_c'), ('c', '_', 'd_e'), NA, + ('f', '_', 'g_h')]) tm.assert_series_equal(result, exp) result = values.str.rpartition('_', expand=False) - exp = Series([['a_b', '_', 'c'], ['c_d', '_', 'e'], NA, ['f_g', '_', - 'h']]) + exp = Series([('a_b', '_', 'c'), ('c_d', '_', 'e'), NA, + ('f_g', '_', 'h')]) tm.assert_series_equal(result, exp) # more than one char values = Series(['a__b__c', 'c__d__e', NA, 'f__g__h']) result = values.str.partition('__', expand=False) - exp = Series([['a', '__', 'b__c'], ['c', '__', 'd__e'], NA, ['f', '__', - 'g__h']]) + exp = Series([('a', '__', 'b__c'), ('c', '__', 'd__e'), NA, + ('f', '__', 'g__h')]) tm.assert_series_equal(result, exp) result = values.str.rpartition('__', expand=False) - exp = Series([['a__b', '__', 'c'], ['c__d', '__', 'e'], NA, - ['f__g', '__', 'h']]) + exp = Series([('a__b', '__', 'c'), ('c__d', '__', 'e'), NA, + ('f__g', '__', 'h')]) tm.assert_series_equal(result, 
exp) # None values = Series(['a b c', 'c d e', NA, 'f g h']) result = values.str.partition(expand=False) - exp = Series([['a', ' ', 'b c'], ['c', ' ', 'd e'], NA, ['f', ' ', - 'g h']]) + exp = Series([('a', ' ', 'b c'), ('c', ' ', 'd e'), NA, + ('f', ' ', 'g h')]) tm.assert_series_equal(result, exp) result = values.str.rpartition(expand=False) - exp = Series([['a b', ' ', 'c'], ['c d', ' ', 'e'], NA, ['f g', ' ', - 'h']]) + exp = Series([('a b', ' ', 'c'), ('c d', ' ', 'e'), NA, + ('f g', ' ', 'h')]) tm.assert_series_equal(result, exp) # Not splited values = Series(['abc', 'cde', NA, 'fgh']) result = values.str.partition('_', expand=False) - exp = Series([['abc', '', ''], ['cde', '', ''], NA, ['fgh', '', '']]) + exp = Series([('abc', '', ''), ('cde', '', ''), NA, ('fgh', '', '')]) tm.assert_series_equal(result, exp) result = values.str.rpartition('_', expand=False) - exp = Series([['', '', 'abc'], ['', '', 'cde'], NA, ['', '', 'fgh']]) + exp = Series([('', '', 'abc'), ('', '', 'cde'), NA, ('', '', 'fgh')]) tm.assert_series_equal(result, exp) # unicode - values = Series([u('a_b_c'), u('c_d_e'), NA, u('f_g_h')]) + values = Series([u'a_b_c', u'c_d_e', NA, u'f_g_h']) result = values.str.partition('_', expand=False) - exp = Series([[u('a'), u('_'), u('b_c')], [u('c'), u('_'), u('d_e')], - NA, [u('f'), u('_'), u('g_h')]]) + exp = Series([(u'a', u'_', u'b_c'), (u'c', u'_', u'd_e'), + NA, (u'f', u'_', u'g_h')]) tm.assert_series_equal(result, exp) result = values.str.rpartition('_', expand=False) - exp = Series([[u('a_b'), u('_'), u('c')], [u('c_d'), u('_'), u('e')], - NA, [u('f_g'), u('_'), u('h')]]) + exp = Series([(u'a_b', u'_', u'c'), (u'c_d', u'_', u'e'), + NA, (u'f_g', u'_', u'h')]) tm.assert_series_equal(result, exp) # compare to standard lib diff --git a/pandas/tests/test_testing.py b/pandas/tests/test_testing.py index 357d53cb58c72..9cc76591e9b7b 100644 --- a/pandas/tests/test_testing.py +++ b/pandas/tests/test_testing.py @@ -43,6 +43,8 @@ def test_assert_almost_equal_numbers(self): def test_assert_almost_equal_numbers_with_zeros(self): self._assert_almost_equal_both(0, 0) + self._assert_almost_equal_both(0, 0.0) + self._assert_almost_equal_both(0, np.float64(0)) self._assert_almost_equal_both(0.000001, 0) self._assert_not_almost_equal_both(0.001, 0) @@ -81,9 +83,11 @@ def __getitem__(self, item): if item == 'a': return 1 - self._assert_almost_equal_both({'a': 1}, DictLikeObj()) + self._assert_almost_equal_both({'a': 1}, DictLikeObj(), + check_dtype=False) - self._assert_not_almost_equal_both({'a': 2}, DictLikeObj()) + self._assert_not_almost_equal_both({'a': 2}, DictLikeObj(), + check_dtype=False) def test_assert_almost_equal_strings(self): self._assert_almost_equal_both('abc', 'abc') @@ -95,7 +99,13 @@ def test_assert_almost_equal_strings(self): def test_assert_almost_equal_iterables(self): self._assert_almost_equal_both([1, 2, 3], [1, 2, 3]) - self._assert_almost_equal_both(np.array([1, 2, 3]), [1, 2, 3]) + self._assert_almost_equal_both(np.array([1, 2, 3]), + np.array([1, 2, 3])) + + # class / dtype are different + self._assert_not_almost_equal_both(np.array([1, 2, 3]), [1, 2, 3]) + self._assert_not_almost_equal_both(np.array([1, 2, 3]), + np.array([1., 2., 3.])) # Can't compare generators self._assert_not_almost_equal_both(iter([1, 2, 3]), [1, 2, 3]) @@ -106,8 +116,8 @@ def test_assert_almost_equal_iterables(self): def test_assert_almost_equal_null(self): self._assert_almost_equal_both(None, None) - self._assert_almost_equal_both(None, np.NaN) + 
self._assert_not_almost_equal_both(None, np.NaN) self._assert_not_almost_equal_both(None, 0) self._assert_not_almost_equal_both(np.NaN, 0) @@ -176,7 +186,7 @@ def test_numpy_array_equal_message(self): assert_almost_equal(np.array([1, 2]), np.array([3, 4, 5])) # scalar comparison - expected = """: 1 != 2""" + expected = """Expected type """ with assertRaisesRegexp(AssertionError, expected): assert_numpy_array_equal(1, 2) expected = """expected 2\\.00000 but got 1\\.00000, with decimal 5""" @@ -191,6 +201,7 @@ def test_numpy_array_equal_message(self): \\[right\\]: int""" with assertRaisesRegexp(AssertionError, expected): + # numpy_array_equal only accepts np.ndarray assert_numpy_array_equal(np.array([1]), 1) with assertRaisesRegexp(AssertionError, expected): assert_almost_equal(np.array([1]), 1) diff --git a/pandas/tests/test_tseries.py b/pandas/tests/test_tseries.py index 854b7295aece4..4dd1cf54a5527 100644 --- a/pandas/tests/test_tseries.py +++ b/pandas/tests/test_tseries.py @@ -36,7 +36,8 @@ def test_backfill(self): filler = algos.backfill_int64(old.values, new.values) - expect_filler = [0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2, -1] + expect_filler = np.array([0, 0, 1, 1, 1, 1, + 2, 2, 2, 2, 2, -1], dtype=np.int64) self.assert_numpy_array_equal(filler, expect_filler) # corner case @@ -44,7 +45,7 @@ def test_backfill(self): new = Index(lrange(5, 10)) filler = algos.backfill_int64(old.values, new.values) - expect_filler = [-1, -1, -1, -1, -1] + expect_filler = np.array([-1, -1, -1, -1, -1], dtype=np.int64) self.assert_numpy_array_equal(filler, expect_filler) def test_pad(self): @@ -53,14 +54,15 @@ def test_pad(self): filler = algos.pad_int64(old.values, new.values) - expect_filler = [-1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2] + expect_filler = np.array([-1, 0, 0, 0, 0, 1, + 1, 1, 1, 1, 2, 2], dtype=np.int64) self.assert_numpy_array_equal(filler, expect_filler) # corner case old = Index([5, 10]) new = Index(lrange(5)) filler = algos.pad_int64(old.values, new.values) - expect_filler = [-1, -1, -1, -1, -1] + expect_filler = np.array([-1, -1, -1, -1, -1], dtype=np.int64) self.assert_numpy_array_equal(filler, expect_filler) @@ -113,9 +115,9 @@ def test_inner_join_indexer(): b = np.array([5], dtype=np.int64) index, ares, bres = algos.inner_join_indexer_int64(a, b) - assert_almost_equal(index, [5]) - assert_almost_equal(ares, [0]) - assert_almost_equal(bres, [0]) + tm.assert_numpy_array_equal(index, np.array([5], dtype=np.int64)) + tm.assert_numpy_array_equal(ares, np.array([0], dtype=np.int64)) + tm.assert_numpy_array_equal(bres, np.array([0], dtype=np.int64)) def test_outer_join_indexer(): @@ -136,9 +138,9 @@ def test_outer_join_indexer(): b = np.array([5], dtype=np.int64) index, ares, bres = algos.outer_join_indexer_int64(a, b) - assert_almost_equal(index, [5]) - assert_almost_equal(ares, [0]) - assert_almost_equal(bres, [0]) + tm.assert_numpy_array_equal(index, np.array([5], dtype=np.int64)) + tm.assert_numpy_array_equal(ares, np.array([0], dtype=np.int64)) + tm.assert_numpy_array_equal(bres, np.array([0], dtype=np.int64)) def test_left_join_indexer(): @@ -158,9 +160,9 @@ def test_left_join_indexer(): b = np.array([5], dtype=np.int64) index, ares, bres = algos.left_join_indexer_int64(a, b) - assert_almost_equal(index, [5]) - assert_almost_equal(ares, [0]) - assert_almost_equal(bres, [0]) + tm.assert_numpy_array_equal(index, np.array([5], dtype=np.int64)) + tm.assert_numpy_array_equal(ares, np.array([0], dtype=np.int64)) + tm.assert_numpy_array_equal(bres, np.array([0], dtype=np.int64)) def 
test_left_join_indexer2(): @@ -494,8 +496,8 @@ def _check(dtype): bins = np.array([6, 12, 20]) out = np.zeros((3, 4), dtype) counts = np.zeros(len(out), dtype=np.int64) - labels = com._ensure_int64(np.repeat( - np.arange(3), np.diff(np.r_[0, bins]))) + labels = com._ensure_int64(np.repeat(np.arange(3), + np.diff(np.r_[0, bins]))) func = getattr(algos, 'group_ohlc_%s' % dtype) func(out, counts, obj[:, None], labels) @@ -505,11 +507,12 @@ def _ohlc(group): return np.repeat(nan, 4) return [group[0], group.max(), group.min(), group[-1]] - expected = np.array([_ohlc(obj[:6]), _ohlc(obj[6:12]), _ohlc(obj[12:]) - ]) + expected = np.array([_ohlc(obj[:6]), _ohlc(obj[6:12]), + _ohlc(obj[12:])]) assert_almost_equal(out, expected) - assert_almost_equal(counts, [6, 6, 8]) + tm.assert_numpy_array_equal(counts, + np.array([6, 6, 8], dtype=np.int64)) obj[:6] = nan func(out, counts, obj[:, None], labels) diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py index 1185f95dbd51f..2ec419221c6d8 100644 --- a/pandas/tests/test_window.py +++ b/pandas/tests/test_window.py @@ -12,10 +12,6 @@ import pandas as pd from pandas import (Series, DataFrame, Panel, bdate_range, isnull, notnull, concat) -from pandas.util.testing import (assert_almost_equal, assert_series_equal, - assert_frame_equal, assert_panel_equal, - assert_index_equal, assert_numpy_array_equal, - slow) import pandas.core.datetools as datetools import pandas.stats.moments as mom import pandas.core.window as rwindow @@ -27,6 +23,13 @@ N, K = 100, 10 +def assert_equal(left, right): + if isinstance(left, Series): + tm.assert_series_equal(left, right) + else: + tm.assert_frame_equal(left, right) + + class Base(tm.TestCase): _multiprocess_can_split_ = True @@ -94,11 +97,11 @@ def tests_skip_nuisance(self): expected = DataFrame({'A': [np.nan, np.nan, 3, 6, 9], 'B': [np.nan, np.nan, 18, 21, 24]}, columns=list('AB')) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) expected = pd.concat([r[['A', 'B']].sum(), df[['C']]], axis=1) result = r.sum() - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_agg(self): df = DataFrame({'A': range(5), 'B': range(0, 10, 2)}) @@ -115,50 +118,51 @@ def test_agg(self): expected = pd.concat([a_mean, a_std, b_mean, b_std], axis=1) expected.columns = pd.MultiIndex.from_product([['A', 'B'], ['mean', 'std']]) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = r.aggregate({'A': np.mean, 'B': np.std}) expected = pd.concat([a_mean, b_std], axis=1) - assert_frame_equal(result, expected, check_like=True) + tm.assert_frame_equal(result, expected, check_like=True) result = r.aggregate({'A': ['mean', 'std']}) expected = pd.concat([a_mean, a_std], axis=1) expected.columns = pd.MultiIndex.from_tuples([('A', 'mean'), ('A', 'std')]) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = r['A'].aggregate(['mean', 'sum']) expected = pd.concat([a_mean, a_sum], axis=1) expected.columns = ['mean', 'sum'] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = r.aggregate({'A': {'mean': 'mean', 'sum': 'sum'}}) expected = pd.concat([a_mean, a_sum], axis=1) - expected.columns = pd.MultiIndex.from_tuples([('A', 'mean'), ('A', - 'sum')]) - assert_frame_equal(result, expected, check_like=True) + expected.columns = pd.MultiIndex.from_tuples([('A', 'mean'), + ('A', 'sum')]) + tm.assert_frame_equal(result, expected, check_like=True) result = r.aggregate({'A': {'mean': 
'mean', 'sum': 'sum'}, 'B': {'mean2': 'mean', 'sum2': 'sum'}}) expected = pd.concat([a_mean, a_sum, b_mean, b_sum], axis=1) - expected.columns = pd.MultiIndex.from_tuples([('A', 'mean'), ( - 'A', 'sum'), ('B', 'mean2'), ('B', 'sum2')]) - assert_frame_equal(result, expected, check_like=True) + exp_cols = [('A', 'mean'), ('A', 'sum'), ('B', 'mean2'), ('B', 'sum2')] + expected.columns = pd.MultiIndex.from_tuples(exp_cols) + tm.assert_frame_equal(result, expected, check_like=True) result = r.aggregate({'A': ['mean', 'std'], 'B': ['mean', 'std']}) expected = pd.concat([a_mean, a_std, b_mean, b_std], axis=1) - expected.columns = pd.MultiIndex.from_tuples([('A', 'mean'), ( - 'A', 'std'), ('B', 'mean'), ('B', 'std')]) - assert_frame_equal(result, expected, check_like=True) + + exp_cols = [('A', 'mean'), ('A', 'std'), ('B', 'mean'), ('B', 'std')] + expected.columns = pd.MultiIndex.from_tuples(exp_cols) + tm.assert_frame_equal(result, expected, check_like=True) # passed lambda result = r.agg({'A': np.sum, 'B': lambda x: np.std(x, ddof=1)}) rcustom = r['B'].apply(lambda x: np.std(x, ddof=1)) expected = pd.concat([a_sum, rcustom], axis=1) - assert_frame_equal(result, expected, check_like=True) + tm.assert_frame_equal(result, expected, check_like=True) def test_agg_consistency(self): @@ -195,13 +199,13 @@ def f(): 'ra', 'std'), ('rb', 'mean'), ('rb', 'std')]) result = r[['A', 'B']].agg({'A': {'ra': ['mean', 'std']}, 'B': {'rb': ['mean', 'std']}}) - assert_frame_equal(result, expected, check_like=True) + tm.assert_frame_equal(result, expected, check_like=True) result = r.agg({'A': {'ra': ['mean', 'std']}, 'B': {'rb': ['mean', 'std']}}) expected.columns = pd.MultiIndex.from_tuples([('A', 'ra', 'mean'), ( 'A', 'ra', 'std'), ('B', 'rb', 'mean'), ('B', 'rb', 'std')]) - assert_frame_equal(result, expected, check_like=True) + tm.assert_frame_equal(result, expected, check_like=True) def test_window_with_args(self): tm._skip_if_no_scipy() @@ -213,7 +217,7 @@ def test_window_with_args(self): expected.columns = ['', ''] result = r.aggregate([lambda x: x.mean(std=10), lambda x: x.mean(std=.01)]) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def a(x): return x.mean(std=10) @@ -224,7 +228,7 @@ def b(x): expected = pd.concat([r.mean(std=10), r.mean(std=.01)], axis=1) expected.columns = ['a', 'b'] result = r.aggregate([a, b]) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_preserve_metadata(self): # GH 10565 @@ -262,7 +266,7 @@ def test_how_compat(self): expected = getattr( getattr(s, t)(freq='D', **kwargs), op)(how=how) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) class TestWindow(Base): @@ -555,7 +559,7 @@ def test_dtypes(self): def check_dtypes(self, f, f_name, d, d_name, exp): roll = d.rolling(window=self.window) result = f(roll) - assert_almost_equal(result, exp) + tm.assert_almost_equal(result, exp) class TestDtype_object(Dtype): @@ -642,7 +646,7 @@ def check_dtypes(self, f, f_name, d, d_name, exp): if f_name == 'count': result = f(roll) - assert_almost_equal(result, exp) + tm.assert_almost_equal(result, exp) else: @@ -714,11 +718,11 @@ def test_cmov_mean(self): with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): rs = mom.rolling_mean(vals, 5, center=True) - assert_almost_equal(xp, rs) + tm.assert_almost_equal(xp, rs) xp = Series(rs) rs = Series(vals).rolling(5, center=True).mean() - assert_series_equal(xp, rs) + tm.assert_series_equal(xp, rs) def test_cmov_window(self): # GH 
8238 @@ -731,11 +735,11 @@ def test_cmov_window(self): with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): rs = mom.rolling_window(vals, 5, 'boxcar', center=True) - assert_almost_equal(xp, rs) + tm.assert_almost_equal(xp, rs) xp = Series(rs) rs = Series(vals).rolling(5, win_type='boxcar', center=True).mean() - assert_series_equal(xp, rs) + tm.assert_series_equal(xp, rs) def test_cmov_window_corner(self): # GH 8238 @@ -777,7 +781,7 @@ def test_cmov_window_frame(self): # DataFrame rs = DataFrame(vals).rolling(5, win_type='boxcar', center=True).mean() - assert_frame_equal(DataFrame(xp), rs) + tm.assert_frame_equal(DataFrame(xp), rs) # invalid method with self.assertRaises(AttributeError): @@ -791,7 +795,7 @@ def test_cmov_window_frame(self): ], [np.nan, np.nan]]) rs = DataFrame(vals).rolling(5, win_type='boxcar', center=True).sum() - assert_frame_equal(DataFrame(xp), rs) + tm.assert_frame_equal(DataFrame(xp), rs) def test_cmov_window_na_min_periods(self): tm._skip_if_no_scipy() @@ -804,7 +808,7 @@ def test_cmov_window_na_min_periods(self): xp = vals.rolling(5, min_periods=4, center=True).mean() rs = vals.rolling(5, win_type='boxcar', min_periods=4, center=True).mean() - assert_series_equal(xp, rs) + tm.assert_series_equal(xp, rs) def test_cmov_window_regular(self): # GH 8238 @@ -837,7 +841,7 @@ def test_cmov_window_regular(self): for wt in win_types: xp = Series(xps[wt]) rs = Series(vals).rolling(5, win_type=wt, center=True).mean() - assert_series_equal(xp, rs) + tm.assert_series_equal(xp, rs) def test_cmov_window_regular_linear_range(self): # GH 8238 @@ -854,7 +858,7 @@ def test_cmov_window_regular_linear_range(self): for wt in win_types: rs = Series(vals).rolling(5, win_type=wt, center=True).mean() - assert_series_equal(xp, rs) + tm.assert_series_equal(xp, rs) def test_cmov_window_regular_missing_data(self): # GH 8238 @@ -887,7 +891,7 @@ def test_cmov_window_regular_missing_data(self): for wt in win_types: xp = Series(xps[wt]) rs = Series(vals).rolling(5, win_type=wt, min_periods=3).mean() - assert_series_equal(xp, rs) + tm.assert_series_equal(xp, rs) def test_cmov_window_special(self): # GH 8238 @@ -914,7 +918,7 @@ def test_cmov_window_special(self): for wt, k in zip(win_types, kwds): xp = Series(xps[wt]) rs = Series(vals).rolling(5, win_type=wt, center=True).mean(**k) - assert_series_equal(xp, rs) + tm.assert_series_equal(xp, rs) def test_cmov_window_special_linear_range(self): # GH 8238 @@ -932,7 +936,7 @@ def test_cmov_window_special_linear_range(self): for wt, k in zip(win_types, kwds): rs = Series(vals).rolling(5, win_type=wt, center=True).mean(**k) - assert_series_equal(xp, rs) + tm.assert_series_equal(xp, rs) def test_rolling_median(self): with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): @@ -946,7 +950,7 @@ def test_rolling_min(self): with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): a = np.array([1, 2, 3, 4, 5]) b = mom.rolling_min(a, window=100, min_periods=1) - assert_almost_equal(b, np.ones(len(a))) + tm.assert_almost_equal(b, np.ones(len(a))) self.assertRaises(ValueError, mom.rolling_min, np.array([1, 2, 3]), window=3, min_periods=5) @@ -958,7 +962,7 @@ def test_rolling_max(self): with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): a = np.array([1, 2, 3, 4, 5], dtype=np.float64) b = mom.rolling_max(a, window=100, min_periods=1) - assert_almost_equal(a, b) + tm.assert_almost_equal(a, b) self.assertRaises(ValueError, mom.rolling_max, np.array([1, 2, 3]), window=3, min_periods=5) @@ -994,7 +998,8 
@@ def test_rolling_apply(self): category=RuntimeWarning) ser = Series([]) - assert_series_equal(ser, ser.rolling(10).apply(lambda x: x.mean())) + tm.assert_series_equal(ser, + ser.rolling(10).apply(lambda x: x.mean())) f = lambda x: x[np.isfinite(x)].mean() @@ -1010,10 +1015,10 @@ def roll_mean(x, window, min_periods=None, freq=None, center=False, s = Series([None, None, None]) result = s.rolling(2, min_periods=0).apply(lambda x: len(x)) expected = Series([1., 2., 2.]) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) result = s.rolling(2, min_periods=0).apply(len) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) def test_rolling_apply_out_of_bounds(self): # #1850 @@ -1026,7 +1031,7 @@ def test_rolling_apply_out_of_bounds(self): with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = mom.rolling_apply(arr, 10, np.sum, min_periods=1) - assert_almost_equal(result, result) + tm.assert_almost_equal(result, result) def test_rolling_std(self): self._check_moment_func(mom.rolling_std, lambda x: np.std(x, ddof=1), @@ -1039,13 +1044,13 @@ def test_rolling_std_1obs(self): result = mom.rolling_std(np.array([1., 2., 3., 4., 5.]), 1, min_periods=1) expected = np.array([np.nan] * 5) - assert_almost_equal(result, expected) + tm.assert_almost_equal(result, expected) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = mom.rolling_std(np.array([1., 2., 3., 4., 5.]), 1, min_periods=1, ddof=0) expected = np.zeros(5) - assert_almost_equal(result, expected) + tm.assert_almost_equal(result, expected) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = mom.rolling_std(np.array([np.nan, np.nan, 3., 4., 5.]), @@ -1159,7 +1164,7 @@ def get_result(arr, window, min_periods=None, center=False): kwargs) result = get_result(self.arr, window) - assert_almost_equal(result[-1], static_comp(self.arr[-50:])) + tm.assert_almost_equal(result[-1], static_comp(self.arr[-50:])) if preserve_nan: assert (np.isnan(result[self._nan_locs]).all()) @@ -1171,7 +1176,7 @@ def get_result(arr, window, min_periods=None, center=False): if has_min_periods: result = get_result(arr, 50, min_periods=30) - assert_almost_equal(result[-1], static_comp(arr[10:-10])) + tm.assert_almost_equal(result[-1], static_comp(arr[10:-10])) # min_periods is working correctly result = get_result(arr, 20, min_periods=15) @@ -1189,10 +1194,10 @@ def get_result(arr, window, min_periods=None, center=False): # min_periods=0 result0 = get_result(arr, 20, min_periods=0) result1 = get_result(arr, 20, min_periods=1) - assert_almost_equal(result0, result1) + tm.assert_almost_equal(result0, result1) else: result = get_result(arr, 50) - assert_almost_equal(result[-1], static_comp(arr[10:-10])) + tm.assert_almost_equal(result[-1], static_comp(arr[10:-10])) # GH 7925 if has_center: @@ -1210,7 +1215,8 @@ def get_result(arr, window, min_periods=None, center=False): if test_stable: result = get_result(self.arr + 1e9, window) - assert_almost_equal(result[-1], static_comp(self.arr[-50:] + 1e9)) + tm.assert_almost_equal(result[-1], + static_comp(self.arr[-50:] + 1e9)) # Test window larger than array, #7297 if test_window: @@ -1224,14 +1230,15 @@ def get_result(arr, window, min_periods=None, center=False): self.assertTrue(np.array_equal(nan_mask, np.isnan( expected))) nan_mask = ~nan_mask - assert_almost_equal(result[nan_mask], expected[nan_mask]) + tm.assert_almost_equal(result[nan_mask], + expected[nan_mask]) else: result = 
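# The rolling-apply hunks above keep asserting that min_periods=0 still
# evaluates the function on all-NaN windows. Restated outside the test
# class (values copied from the test itself):
import pandas as pd
import pandas.util.testing as tm

s = pd.Series([None, None, None])
# len() sees the raw window, so the count simply grows to the window size
result = s.rolling(2, min_periods=0).apply(len)
tm.assert_series_equal(result, pd.Series([1., 2., 2.]))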
get_result(self.arr, len(self.arr) + 1) expected = get_result(self.arr, len(self.arr)) nan_mask = np.isnan(result) self.assertTrue(np.array_equal(nan_mask, np.isnan(expected))) nan_mask = ~nan_mask - assert_almost_equal(result[nan_mask], expected[nan_mask]) + tm.assert_almost_equal(result[nan_mask], expected[nan_mask]) def _check_structures(self, f, static_comp, name=None, has_min_periods=True, has_time_rule=True, @@ -1283,11 +1290,12 @@ def get_result(obj, window, min_periods=None, freq=None, center=False): trunc_series = self.series[::2].truncate(prev_date, last_date) trunc_frame = self.frame[::2].truncate(prev_date, last_date) - assert_almost_equal(series_result[-1], static_comp(trunc_series)) + self.assertAlmostEqual(series_result[-1], + static_comp(trunc_series)) - assert_series_equal(frame_result.xs(last_date), - trunc_frame.apply(static_comp), - check_names=False) + tm.assert_series_equal(frame_result.xs(last_date), + trunc_frame.apply(static_comp), + check_names=False) # GH 7925 if has_center: @@ -1326,8 +1334,8 @@ def get_result(obj, window, min_periods=None, freq=None, center=False): if fill_value is not None: series_xp = series_xp.fillna(fill_value) frame_xp = frame_xp.fillna(fill_value) - assert_series_equal(series_xp, series_rs) - assert_frame_equal(frame_xp, frame_rs) + tm.assert_series_equal(series_xp, series_rs) + tm.assert_frame_equal(frame_xp, frame_rs) def test_ewma(self): self._check_ew(mom.ewma, name='mean') @@ -1347,7 +1355,7 @@ def test_ewma(self): lambda s: s.ewm(com=2.0, adjust=True, ignore_na=True).mean(), ]: result = f(s) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) expected = Series([1.0, 1.333333, 2.222222, 4.148148]) for f in [lambda s: s.ewm(com=2.0, adjust=False).mean(), @@ -1357,7 +1365,7 @@ def test_ewma(self): ignore_na=True).mean(), ]: result = f(s) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) def test_ewma_nan_handling(self): s = Series([1.] 
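# test_ewma above pins the adjust=False recursion: with com=2.0 the decay is
# alpha = 1/(1+com) = 1/3 and y[t] = (1-alpha)*y[t-1] + alpha*x[t]. Sketch
# (the input series is assumed from the full test source; it is not shown in
# the hunk):
import pandas as pd
import pandas.util.testing as tm

s = pd.Series([1.0, 2.0, 4.0, 8.0])
result = s.ewm(com=2.0, adjust=False).mean()
# e.g. y[1] = (2/3)*1.0 + (1/3)*2.0 = 1.333333
tm.assert_series_equal(result, pd.Series([1.0, 1.333333, 2.222222, 4.148148]))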
+ [np.nan] * 5 + [1.]) @@ -1408,11 +1416,11 @@ def simple_wma(s, w): expected = simple_wma(s, Series(w)) result = s.ewm(com=com, adjust=adjust, ignore_na=ignore_na).mean() - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) if ignore_na is False: # check that ignore_na defaults to False result = s.ewm(com=com, adjust=adjust).mean() - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) def test_ewmvar(self): self._check_ew(mom.ewmvar, name='var') @@ -1424,7 +1432,7 @@ def test_ewma_span_com_args(self): with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): A = mom.ewma(self.arr, com=9.5) B = mom.ewma(self.arr, span=20) - assert_almost_equal(A, B) + tm.assert_almost_equal(A, B) self.assertRaises(ValueError, mom.ewma, self.arr, com=9.5, span=20) self.assertRaises(ValueError, mom.ewma, self.arr) @@ -1433,7 +1441,7 @@ def test_ewma_halflife_arg(self): with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): A = mom.ewma(self.arr, com=13.932726172912965) B = mom.ewma(self.arr, halflife=10.0) - assert_almost_equal(A, B) + tm.assert_almost_equal(A, B) self.assertRaises(ValueError, mom.ewma, self.arr, span=20, halflife=50) @@ -1450,9 +1458,9 @@ def test_ewma_alpha_old_api(self): b = mom.ewma(self.arr, com=0.62014947789973052) c = mom.ewma(self.arr, span=2.240298955799461) d = mom.ewma(self.arr, halflife=0.721792864318) - assert_numpy_array_equal(a, b) - assert_numpy_array_equal(a, c) - assert_numpy_array_equal(a, d) + tm.assert_numpy_array_equal(a, b) + tm.assert_numpy_array_equal(a, c) + tm.assert_numpy_array_equal(a, d) def test_ewma_alpha_arg_old_api(self): # GH 10789 @@ -1472,9 +1480,9 @@ def test_ewm_alpha(self): b = s.ewm(com=0.62014947789973052).mean() c = s.ewm(span=2.240298955799461).mean() d = s.ewm(halflife=0.721792864318).mean() - assert_series_equal(a, b) - assert_series_equal(a, c) - assert_series_equal(a, d) + tm.assert_series_equal(a, b) + tm.assert_series_equal(a, c) + tm.assert_series_equal(a, d) def test_ewm_alpha_arg(self): # GH 10789 @@ -1516,7 +1524,7 @@ def test_ew_empty_arrays(self): with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = f(arr, 3) - assert_almost_equal(result, arr) + tm.assert_almost_equal(result, arr) def _check_ew(self, func, name=None): with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): @@ -1553,16 +1561,16 @@ def _check_ew_ndarray(self, func, preserve_nan=False, name=None): # check series of length 0 result = func(Series([]), 50, min_periods=min_periods) - assert_series_equal(result, Series([])) + tm.assert_series_equal(result, Series([])) # check series of length 1 result = func(Series([1.]), 50, min_periods=min_periods) if func == mom.ewma: - assert_series_equal(result, Series([1.])) + tm.assert_series_equal(result, Series([1.])) else: # ewmstd, ewmvol, ewmvar with bias=False require at least two # values - assert_series_equal(result, Series([np.NaN])) + tm.assert_series_equal(result, Series([np.NaN])) # pass in ints result2 = func(np.arange(50), span=10) @@ -1694,8 +1702,6 @@ def _non_null_values(x): return set(values[notnull(values)].tolist()) for (x, is_constant, no_nans) in self.data: - assert_equal = assert_series_equal if isinstance( - x, Series) else assert_frame_equal count_x = count(x) mean_x = mean(x) @@ -1800,7 +1806,7 @@ def _non_null_values(x): assert_equal(cov_x_y, mean_x_times_y - (mean_x * mean_y)) - @slow + @tm.slow def test_ewm_consistency(self): def _weights(s, com, adjust, ignore_na): if 
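# test_ewm_alpha above asserts that com, span, halflife and alpha are
# interchangeable parameterisations of one decay rate:
# alpha = 1/(1+com) = 2/(span+1) = 1 - exp(log(0.5)/halflife).
# Sketch using the constants from the test, with alpha derived from com via
# the first relation (the input series is assumed, not from the patch):
import numpy as np
import pandas as pd
import pandas.util.testing as tm

s = pd.Series(np.random.randn(100))
a = s.ewm(alpha=0.61722699889169674).mean()
b = s.ewm(com=0.62014947789973052).mean()
c = s.ewm(span=2.240298955799461).mean()
d = s.ewm(halflife=0.721792864318).mean()
tm.assert_series_equal(a, b)
tm.assert_series_equal(a, c)
tm.assert_series_equal(a, d)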
isinstance(s, DataFrame): @@ -1899,7 +1905,7 @@ def _ewma(s, com, min_periods, adjust, ignore_na): _variance_debiasing_factors(x, com=com, adjust=adjust, ignore_na=ignore_na))) - @slow + @tm.slow def test_expanding_consistency(self): # suppress warnings about empty slices, as we are deliberately testing @@ -1942,8 +1948,6 @@ def test_expanding_consistency(self): # expanding_apply of Series.xyz(), or (b) expanding_apply of # np.nanxyz() for (x, is_constant, no_nans) in self.data: - assert_equal = assert_series_equal if isinstance( - x, Series) else assert_frame_equal functions = self.base_functions # GH 8269 @@ -1988,9 +1992,9 @@ def test_expanding_consistency(self): x.iloc[:, i].expanding( min_periods=min_periods), name)(x.iloc[:, j]) - assert_panel_equal(expanding_f_result, expected) + tm.assert_panel_equal(expanding_f_result, expected) - @slow + @tm.slow def test_rolling_consistency(self): # suppress warnings about empty slices, as we are deliberately testing @@ -2062,10 +2066,6 @@ def cases(): # rolling_apply of Series.xyz(), or (b) rolling_apply of # np.nanxyz() for (x, is_constant, no_nans) in self.data: - - assert_equal = (assert_series_equal - if isinstance(x, Series) else - assert_frame_equal) functions = self.base_functions # GH 8269 @@ -2116,7 +2116,7 @@ def cases(): min_periods=min_periods, center=center), name)(x.iloc[:, j])) - assert_panel_equal(rolling_f_result, expected) + tm.assert_panel_equal(rolling_f_result, expected) # binary moments def test_rolling_cov(self): @@ -2124,7 +2124,7 @@ def test_rolling_cov(self): B = A + randn(len(A)) result = A.rolling(window=50, min_periods=25).cov(B) - assert_almost_equal(result[-1], np.cov(A[-50:], B[-50:])[0, 1]) + tm.assert_almost_equal(result[-1], np.cov(A[-50:], B[-50:])[0, 1]) def test_rolling_cov_pairwise(self): self._check_pairwise_moment('rolling', 'cov', window=10, min_periods=5) @@ -2134,7 +2134,7 @@ def test_rolling_corr(self): B = A + randn(len(A)) result = A.rolling(window=50, min_periods=25).corr(B) - assert_almost_equal(result[-1], np.corrcoef(A[-50:], B[-50:])[0, 1]) + tm.assert_almost_equal(result[-1], np.corrcoef(A[-50:], B[-50:])[0, 1]) # test for correct bias correction a = tm.makeTimeSeries() @@ -2143,7 +2143,7 @@ def test_rolling_corr(self): b[:10] = np.nan result = a.rolling(window=len(a), min_periods=1).corr(b) - assert_almost_equal(result[-1], a.corr(b)) + tm.assert_almost_equal(result[-1], a.corr(b)) def test_rolling_corr_pairwise(self): self._check_pairwise_moment('rolling', 'corr', window=10, @@ -2244,18 +2244,18 @@ def func(A, B, com, **kwargs): # check series of length 0 result = func(Series([]), Series([]), 50, min_periods=min_periods) - assert_series_equal(result, Series([])) + tm.assert_series_equal(result, Series([])) # check series of length 1 result = func( Series([1.]), Series([1.]), 50, min_periods=min_periods) - assert_series_equal(result, Series([np.NaN])) + tm.assert_series_equal(result, Series([np.NaN])) self.assertRaises(Exception, func, A, randn(50), 20, min_periods=5) def test_expanding_apply(self): ser = Series([]) - assert_series_equal(ser, ser.expanding().apply(lambda x: x.mean())) + tm.assert_series_equal(ser, ser.expanding().apply(lambda x: x.mean())) def expanding_mean(x, min_periods=1, freq=None): return mom.expanding_apply(x, lambda x: x.mean(), @@ -2267,7 +2267,7 @@ def expanding_mean(x, min_periods=1, freq=None): s = Series([None, None, None]) result = s.expanding(min_periods=0).apply(lambda x: len(x)) expected = Series([1., 2., 3.]) - assert_series_equal(result, expected) + 
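# test_expanding_apply above covers the degenerate inputs: an empty Series
# round-trips unchanged, and min_periods=0 evaluates every prefix. Restated:
import pandas as pd
import pandas.util.testing as tm

ser = pd.Series([])
tm.assert_series_equal(ser, ser.expanding().apply(lambda x: x.mean()))

s = pd.Series([None, None, None])
result = s.expanding(min_periods=0).apply(len)
tm.assert_series_equal(result, pd.Series([1., 2., 3.]))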
tm.assert_series_equal(result, expected) def test_expanding_apply_args_kwargs(self): def mean_w_arg(x, const): @@ -2277,11 +2277,11 @@ def mean_w_arg(x, const): expected = df.expanding().apply(np.mean) + 20. - assert_frame_equal(df.expanding().apply(mean_w_arg, args=(20, )), - expected) - assert_frame_equal(df.expanding().apply(mean_w_arg, - kwargs={'const': 20}), - expected) + tm.assert_frame_equal(df.expanding().apply(mean_w_arg, args=(20, )), + expected) + tm.assert_frame_equal(df.expanding().apply(mean_w_arg, + kwargs={'const': 20}), + expected) def test_expanding_corr(self): A = self.series.dropna() @@ -2291,11 +2291,11 @@ def test_expanding_corr(self): rolling_result = A.rolling(window=len(A), min_periods=1).corr(B) - assert_almost_equal(rolling_result, result) + tm.assert_almost_equal(rolling_result, result) def test_expanding_count(self): result = self.series.expanding().count() - assert_almost_equal(result, self.series.rolling( + tm.assert_almost_equal(result, self.series.rolling( window=len(self.series)).count()) def test_expanding_quantile(self): @@ -2304,7 +2304,7 @@ def test_expanding_quantile(self): rolling_result = self.series.rolling(window=len(self.series), min_periods=1).quantile(0.5) - assert_almost_equal(result, rolling_result) + tm.assert_almost_equal(result, rolling_result) def test_expanding_cov(self): A = self.series @@ -2314,7 +2314,7 @@ def test_expanding_cov(self): rolling_result = A.rolling(window=len(A), min_periods=1).cov(B) - assert_almost_equal(rolling_result, result) + tm.assert_almost_equal(rolling_result, result) def test_expanding_max(self): self._check_expanding(mom.expanding_max, np.max, preserve_nan=False) @@ -2326,7 +2326,7 @@ def test_expanding_cov_pairwise(self): min_periods=1).corr() for i in result.items: - assert_almost_equal(result[i], rolling_result[i]) + tm.assert_almost_equal(result[i], rolling_result[i]) def test_expanding_corr_pairwise(self): result = self.frame.expanding().corr() @@ -2335,7 +2335,7 @@ def test_expanding_corr_pairwise(self): min_periods=1).corr() for i in result.items: - assert_almost_equal(result[i], rolling_result[i]) + tm.assert_almost_equal(result[i], rolling_result[i]) def test_expanding_cov_diff_index(self): # GH 7512 @@ -2343,17 +2343,17 @@ def test_expanding_cov_diff_index(self): s2 = Series([1, 3], index=[0, 2]) result = s1.expanding().cov(s2) expected = Series([None, None, 2.0]) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) s2a = Series([1, None, 3], index=[0, 1, 2]) result = s1.expanding().cov(s2a) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) s1 = Series([7, 8, 10], index=[0, 1, 3]) s2 = Series([7, 9, 10], index=[0, 2, 3]) result = s1.expanding().cov(s2) expected = Series([None, None, None, 4.5]) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) def test_expanding_corr_diff_index(self): # GH 7512 @@ -2361,17 +2361,17 @@ def test_expanding_corr_diff_index(self): s2 = Series([1, 3], index=[0, 2]) result = s1.expanding().corr(s2) expected = Series([None, None, 1.0]) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) s2a = Series([1, None, 3], index=[0, 1, 2]) result = s1.expanding().corr(s2a) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) s1 = Series([7, 8, 10], index=[0, 1, 3]) s2 = Series([7, 9, 10], index=[0, 2, 3]) result = s1.expanding().corr(s2) expected = Series([None, None, None, 1.]) - assert_series_equal(result, expected) + 
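# test_expanding_cov_diff_index above: differently-indexed series are
# aligned before the window runs, and windows with fewer than two paired
# observations stay NaN. Sketch (s1 is assumed from the full test source):
import pandas as pd
import pandas.util.testing as tm

s1 = pd.Series([1, 2, 3], index=[0, 1, 2])
s2 = pd.Series([1, 3], index=[0, 2])
# only labels 0 and 2 pair up, so the first defined covariance is the last
result = s1.expanding().cov(s2)
tm.assert_series_equal(result, pd.Series([None, None, 2.0]))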
tm.assert_series_equal(result, expected) def test_rolling_cov_diff_length(self): # GH 7512 @@ -2379,11 +2379,11 @@ def test_rolling_cov_diff_length(self): s2 = Series([1, 3], index=[0, 2]) result = s1.rolling(window=3, min_periods=2).cov(s2) expected = Series([None, None, 2.0]) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) s2a = Series([1, None, 3], index=[0, 1, 2]) result = s1.rolling(window=3, min_periods=2).cov(s2a) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) def test_rolling_corr_diff_length(self): # GH 7512 @@ -2391,11 +2391,11 @@ def test_rolling_corr_diff_length(self): s2 = Series([1, 3], index=[0, 2]) result = s1.rolling(window=3, min_periods=2).corr(s2) expected = Series([None, None, 1.0]) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) s2a = Series([1, None, 3], index=[0, 1, 2]) result = s1.rolling(window=3, min_periods=2).corr(s2a) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) def test_rolling_functions_window_non_shrinkage(self): # GH 7764 @@ -2427,10 +2427,10 @@ def test_rolling_functions_window_non_shrinkage(self): for f in functions: try: s_result = f(s) - assert_series_equal(s_result, s_expected) + tm.assert_series_equal(s_result, s_expected) df_result = f(df) - assert_frame_equal(df_result, df_expected) + tm.assert_frame_equal(df_result, df_expected) except (ImportError): # scipy needed for rolling_window @@ -2442,7 +2442,7 @@ def test_rolling_functions_window_non_shrinkage(self): .corr(x, pairwise=True))] for f in functions: df_result_panel = f(df) - assert_panel_equal(df_result_panel, df_expected_panel) + tm.assert_panel_equal(df_result_panel, df_expected_panel) def test_moment_functions_zero_length(self): # GH 8056 @@ -2497,13 +2497,13 @@ def test_moment_functions_zero_length(self): for f in functions: try: s_result = f(s) - assert_series_equal(s_result, s_expected) + tm.assert_series_equal(s_result, s_expected) df1_result = f(df1) - assert_frame_equal(df1_result, df1_expected) + tm.assert_frame_equal(df1_result, df1_expected) df2_result = f(df2) - assert_frame_equal(df2_result, df2_expected) + tm.assert_frame_equal(df2_result, df2_expected) except (ImportError): # scipy needed for rolling_window @@ -2520,10 +2520,10 @@ def test_moment_functions_zero_length(self): ] for f in functions: df1_result_panel = f(df1) - assert_panel_equal(df1_result_panel, df1_expected_panel) + tm.assert_panel_equal(df1_result_panel, df1_expected_panel) df2_result_panel = f(df2) - assert_panel_equal(df2_result_panel, df2_expected_panel) + tm.assert_panel_equal(df2_result_panel, df2_expected_panel) def test_expanding_cov_pairwise_diff_length(self): # GH 7512 @@ -2537,10 +2537,10 @@ def test_expanding_cov_pairwise_diff_length(self): result4 = df1a.expanding().cov(df2a, pairwise=True)[2] expected = DataFrame([[-3., -5.], [-6., -10.]], index=['A', 'B'], columns=['X', 'Y']) - assert_frame_equal(result1, expected) - assert_frame_equal(result2, expected) - assert_frame_equal(result3, expected) - assert_frame_equal(result4, expected) + tm.assert_frame_equal(result1, expected) + tm.assert_frame_equal(result2, expected) + tm.assert_frame_equal(result3, expected) + tm.assert_frame_equal(result4, expected) def test_expanding_corr_pairwise_diff_length(self): # GH 7512 @@ -2554,35 +2554,29 @@ def test_expanding_corr_pairwise_diff_length(self): result4 = df1a.expanding().corr(df2a, pairwise=True)[2] expected = DataFrame([[-1.0, -1.0], [-1.0, -1.0]], index=['A', 
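# The assert_panel_equal hunks above reflect that, in this era of pandas,
# pairwise=True hands back a Panel with one covariance matrix per row label
# (items = the frame's index, major/minor axes = its columns). Sketch with a
# hypothetical frame, not taken from the patch:
import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.randn(6, 2), columns=['A', 'B'])
pw = df.expanding().cov(pairwise=True)
# the item for the last row label is a 2x2 DataFrame labelled ['A', 'B']
print(pw[5])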
'B'], columns=['X', 'Y']) - assert_frame_equal(result1, expected) - assert_frame_equal(result2, expected) - assert_frame_equal(result3, expected) - assert_frame_equal(result4, expected) + tm.assert_frame_equal(result1, expected) + tm.assert_frame_equal(result2, expected) + tm.assert_frame_equal(result3, expected) + tm.assert_frame_equal(result4, expected) def test_pairwise_stats_column_names_order(self): # GH 7738 df1s = [DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[0, 1]), - DataFrame( - [[2, 4], [1, 2], [5, 2], [8, 1]], columns=[1, 0]), - DataFrame( - [[2, 4], [1, 2], [5, 2], [8, 1]], columns=[1, 1]), - DataFrame( - [[2, 4], [1, 2], [5, 2], [8, 1]], columns=['C', 'C']), - DataFrame( - [[2, 4], [1, 2], [5, 2], [8, 1]], columns=[1., 0]), - DataFrame( - [[2, 4], [1, 2], [5, 2], [8, 1]], columns=[0., 1]), - DataFrame( - [[2, 4], [1, 2], [5, 2], [8, 1]], columns=['C', 1]), - DataFrame( - [[2., 4.], [1., 2.], [5., 2.], [8., 1.]], columns=[1, 0.]), - DataFrame( - [[2, 4.], [1, 2.], [5, 2.], [8, 1.]], columns=[0, 1.]), - DataFrame( - [[2, 4], [1, 2], [5, 2], [8, 1.]], columns=[1., 'X']), ] - df2 = DataFrame( - [[None, 1, 1], [None, 1, 2], [None, 3, 2], [None, 8, 1] - ], columns=['Y', 'Z', 'X']) + DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[1, 0]), + DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[1, 1]), + DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], + columns=['C', 'C']), + DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[1., 0]), + DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[0., 1]), + DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=['C', 1]), + DataFrame([[2., 4.], [1., 2.], [5., 2.], [8., 1.]], + columns=[1, 0.]), + DataFrame([[2, 4.], [1, 2.], [5, 2.], [8, 1.]], + columns=[0, 1.]), + DataFrame([[2, 4], [1, 2], [5, 2], [8, 1.]], + columns=[1., 'X']), ] + df2 = DataFrame([[None, 1, 1], [None, 1, 2], + [None, 3, 2], [None, 8, 1]], columns=['Y', 'Z', 'X']) s = Series([1, 1, 3, 8]) # suppress warnings about incomparable objects, as we are deliberately @@ -2596,11 +2590,13 @@ def test_pairwise_stats_column_names_order(self): for f in [lambda x: x.cov(), lambda x: x.corr(), ]: results = [f(df) for df in df1s] for (df, result) in zip(df1s, results): - assert_index_equal(result.index, df.columns) - assert_index_equal(result.columns, df.columns) + tm.assert_index_equal(result.index, df.columns) + tm.assert_index_equal(result.columns, df.columns) for i, result in enumerate(results): if i > 0: - self.assert_numpy_array_equal(result, results[0]) + # compare internal values, as columns can be different + self.assert_numpy_array_equal(result.values, + results[0].values) # DataFrame with itself, pairwise=True for f in [lambda x: x.expanding().cov(pairwise=True), @@ -2611,12 +2607,13 @@ def test_pairwise_stats_column_names_order(self): lambda x: x.ewm(com=3).corr(pairwise=True), ]: results = [f(df) for df in df1s] for (df, result) in zip(df1s, results): - assert_index_equal(result.items, df.index) - assert_index_equal(result.major_axis, df.columns) - assert_index_equal(result.minor_axis, df.columns) + tm.assert_index_equal(result.items, df.index) + tm.assert_index_equal(result.major_axis, df.columns) + tm.assert_index_equal(result.minor_axis, df.columns) for i, result in enumerate(results): if i > 0: - self.assert_numpy_array_equal(result, results[0]) + self.assert_numpy_array_equal(result.values, + results[0].values) # DataFrame with itself, pairwise=False for f in [lambda x: x.expanding().cov(pairwise=False), @@ -2627,11 +2624,12 @@ def 
test_pairwise_stats_column_names_order(self): lambda x: x.ewm(com=3).corr(pairwise=False), ]: results = [f(df) for df in df1s] for (df, result) in zip(df1s, results): - assert_index_equal(result.index, df.index) - assert_index_equal(result.columns, df.columns) + tm.assert_index_equal(result.index, df.index) + tm.assert_index_equal(result.columns, df.columns) for i, result in enumerate(results): if i > 0: - self.assert_numpy_array_equal(result, results[0]) + self.assert_numpy_array_equal(result.values, + results[0].values) # DataFrame with another DataFrame, pairwise=True for f in [lambda x, y: x.expanding().cov(y, pairwise=True), @@ -2642,12 +2640,13 @@ def test_pairwise_stats_column_names_order(self): lambda x, y: x.ewm(com=3).corr(y, pairwise=True), ]: results = [f(df, df2) for df in df1s] for (df, result) in zip(df1s, results): - assert_index_equal(result.items, df.index) - assert_index_equal(result.major_axis, df.columns) - assert_index_equal(result.minor_axis, df2.columns) + tm.assert_index_equal(result.items, df.index) + tm.assert_index_equal(result.major_axis, df.columns) + tm.assert_index_equal(result.minor_axis, df2.columns) for i, result in enumerate(results): if i > 0: - self.assert_numpy_array_equal(result, results[0]) + self.assert_numpy_array_equal(result.values, + results[0].values) # DataFrame with another DataFrame, pairwise=False for f in [lambda x, y: x.expanding().cov(y, pairwise=False), @@ -2662,8 +2661,8 @@ def test_pairwise_stats_column_names_order(self): if result is not None: expected_index = df.index.union(df2.index) expected_columns = df.columns.union(df2.columns) - assert_index_equal(result.index, expected_index) - assert_index_equal(result.columns, expected_columns) + tm.assert_index_equal(result.index, expected_index) + tm.assert_index_equal(result.columns, expected_columns) else: tm.assertRaisesRegexp( ValueError, "'arg1' columns are not unique", f, df, @@ -2681,11 +2680,12 @@ def test_pairwise_stats_column_names_order(self): lambda x, y: x.ewm(com=3).corr(y), ]: results = [f(df, s) for df in df1s] + [f(s, df) for df in df1s] for (df, result) in zip(df1s, results): - assert_index_equal(result.index, df.index) - assert_index_equal(result.columns, df.columns) + tm.assert_index_equal(result.index, df.index) + tm.assert_index_equal(result.columns, df.columns) for i, result in enumerate(results): if i > 0: - self.assert_numpy_array_equal(result, results[0]) + self.assert_numpy_array_equal(result.values, + results[0].values) def test_rolling_skew_edge_cases(self): @@ -2694,19 +2694,19 @@ def test_rolling_skew_edge_cases(self): # yields all NaN (0 variance) d = Series([1] * 5) x = d.rolling(window=5).skew() - assert_series_equal(all_nan, x) + tm.assert_series_equal(all_nan, x) # yields all NaN (window too small) d = Series(np.random.randn(5)) x = d.rolling(window=2).skew() - assert_series_equal(all_nan, x) + tm.assert_series_equal(all_nan, x) # yields [NaN, NaN, NaN, 0.177994, 1.548824] d = Series([-1.50837035, -0.1297039, 0.19501095, 1.73508164, 0.41941401 ]) expected = Series([np.NaN, np.NaN, np.NaN, 0.177994, 1.548824]) x = d.rolling(window=4).skew() - assert_series_equal(expected, x) + tm.assert_series_equal(expected, x) def test_rolling_kurt_edge_cases(self): @@ -2715,25 +2715,25 @@ def test_rolling_kurt_edge_cases(self): # yields all NaN (0 variance) d = Series([1] * 5) x = d.rolling(window=5).kurt() - assert_series_equal(all_nan, x) + tm.assert_series_equal(all_nan, x) # yields all NaN (window too small) d = Series(np.random.randn(5)) x = 
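# The pairwise hunks above switch to comparing result.values: the df1s
# frames deliberately use equal-looking but differently-typed column labels
# (1 vs 1.0, duplicated 'C', mixed str/int), so only the underlying ndarrays
# are expected to agree. Sketch of the values-level comparison:
import pandas as pd
import pandas.util.testing as tm

df_int = pd.DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[0, 1])
df_float = pd.DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[0., 1.])
# the labels differ in type; the covariance values must not
tm.assert_numpy_array_equal(df_int.cov().values, df_float.cov().values)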
d.rolling(window=3).kurt() - assert_series_equal(all_nan, x) + tm.assert_series_equal(all_nan, x) # yields [NaN, NaN, NaN, 1.224307, 2.671499] d = Series([-1.50837035, -0.1297039, 0.19501095, 1.73508164, 0.41941401 ]) expected = Series([np.NaN, np.NaN, np.NaN, 1.224307, 2.671499]) x = d.rolling(window=4).kurt() - assert_series_equal(expected, x) + tm.assert_series_equal(expected, x) def _check_expanding_ndarray(self, func, static_comp, has_min_periods=True, has_time_rule=True, preserve_nan=True): result = func(self.arr) - assert_almost_equal(result[10], static_comp(self.arr[:11])) + tm.assert_almost_equal(result[10], static_comp(self.arr[:11])) if preserve_nan: assert (np.isnan(result[self._nan_locs]).all()) @@ -2743,7 +2743,7 @@ def _check_expanding_ndarray(self, func, static_comp, has_min_periods=True, if has_min_periods: result = func(arr, min_periods=30) assert (np.isnan(result[:29]).all()) - assert_almost_equal(result[-1], static_comp(arr[:50])) + tm.assert_almost_equal(result[-1], static_comp(arr[:50])) # min_periods is working correctly result = func(arr, min_periods=15) @@ -2758,10 +2758,10 @@ def _check_expanding_ndarray(self, func, static_comp, has_min_periods=True, # min_periods=0 result0 = func(arr, min_periods=0) result1 = func(arr, min_periods=1) - assert_almost_equal(result0, result1) + tm.assert_almost_equal(result0, result1) else: result = func(arr) - assert_almost_equal(result[-1], static_comp(arr[:50])) + tm.assert_almost_equal(result[-1], static_comp(arr[:50])) def _check_expanding_structures(self, func): series_result = func(self.series) @@ -2795,7 +2795,7 @@ def test_rolling_max_gh6297(self): index=[datetime(1975, 1, i, 0) for i in range(1, 6)]) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): x = series.rolling(window=1, freq='D').max() - assert_series_equal(expected, x) + tm.assert_series_equal(expected, x) def test_rolling_max_how_resample(self): @@ -2814,14 +2814,14 @@ def test_rolling_max_how_resample(self): index=[datetime(1975, 1, i, 0) for i in range(1, 6)]) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): x = series.rolling(window=1, freq='D').max() - assert_series_equal(expected, x) + tm.assert_series_equal(expected, x) # Now specify median (10.0) expected = Series([0.0, 1.0, 2.0, 3.0, 10.0], index=[datetime(1975, 1, i, 0) for i in range(1, 6)]) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): x = series.rolling(window=1, freq='D').max(how='median') - assert_series_equal(expected, x) + tm.assert_series_equal(expected, x) # Now specify mean (4+10+20)/3 v = (4.0 + 10.0 + 20.0) / 3.0 @@ -2829,7 +2829,7 @@ def test_rolling_max_how_resample(self): index=[datetime(1975, 1, i, 0) for i in range(1, 6)]) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): x = series.rolling(window=1, freq='D').max(how='mean') - assert_series_equal(expected, x) + tm.assert_series_equal(expected, x) def test_rolling_min_how_resample(self): @@ -2848,7 +2848,7 @@ def test_rolling_min_how_resample(self): index=[datetime(1975, 1, i, 0) for i in range(1, 6)]) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): r = series.rolling(window=1, freq='D') - assert_series_equal(expected, r.min()) + tm.assert_series_equal(expected, r.min()) def test_rolling_median_how_resample(self): @@ -2867,7 +2867,7 @@ def test_rolling_median_how_resample(self): index=[datetime(1975, 1, i, 0) for i in range(1, 6)]) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): x = 
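# test_rolling_skew_edge_cases / test_rolling_kurt_edge_cases above: zero
# variance, or a window smaller than the moment's minimum sample size,
# yields all-NaN. Restated for skew:
import numpy as np
import pandas as pd
import pandas.util.testing as tm

d = pd.Series([1] * 5)  # constant, hence zero variance
all_nan = pd.Series([np.nan] * 5)
tm.assert_series_equal(all_nan, d.rolling(window=5).skew())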
series.rolling(window=1, freq='D').median() - assert_series_equal(expected, x) + tm.assert_series_equal(expected, x) def test_rolling_median_memory_error(self): # GH11722 @@ -2917,16 +2917,16 @@ def test_getitem(self): expected = g_mutated.B.apply(lambda x: x.rolling(2).mean()) result = g.rolling(2).mean().B - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) result = g.rolling(2).B.mean() - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) result = g.B.rolling(2).mean() - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) result = self.frame.B.groupby(self.frame.A).rolling(2).mean() - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) def test_getitem_multiple(self): @@ -2937,10 +2937,10 @@ def test_getitem_multiple(self): expected = g_mutated.B.apply(lambda x: x.rolling(2).count()) result = r.B.count() - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) result = r.B.count() - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) def test_rolling(self): g = self.frame.groupby('A') @@ -2949,16 +2949,16 @@ def test_rolling(self): for f in ['sum', 'mean', 'min', 'max', 'count', 'kurt', 'skew']: result = getattr(r, f)() expected = g.apply(lambda x: getattr(x.rolling(4), f)()) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) for f in ['std', 'var']: result = getattr(r, f)(ddof=1) expected = g.apply(lambda x: getattr(x.rolling(4), f)(ddof=1)) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = r.quantile(0.5) expected = g.apply(lambda x: x.rolling(4).quantile(0.5)) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_rolling_corr_cov(self): g = self.frame.groupby('A') @@ -2970,14 +2970,14 @@ def test_rolling_corr_cov(self): def func(x): return getattr(x.rolling(4), f)(self.frame) expected = g.apply(func) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = getattr(r.B, f)(pairwise=True) def func(x): return getattr(x.B.rolling(4), f)(pairwise=True) expected = g.apply(func) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) def test_rolling_apply(self): g = self.frame.groupby('A') @@ -2986,7 +2986,7 @@ def test_rolling_apply(self): # reduction result = r.apply(lambda x: x.sum()) expected = g.apply(lambda x: x.rolling(4).apply(lambda y: y.sum())) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_expanding(self): g = self.frame.groupby('A') @@ -2995,16 +2995,16 @@ def test_expanding(self): for f in ['sum', 'mean', 'min', 'max', 'count', 'kurt', 'skew']: result = getattr(r, f)() expected = g.apply(lambda x: getattr(x.expanding(), f)()) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) for f in ['std', 'var']: result = getattr(r, f)(ddof=0) expected = g.apply(lambda x: getattr(x.expanding(), f)(ddof=0)) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = r.quantile(0.5) expected = g.apply(lambda x: x.expanding().quantile(0.5)) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_expanding_corr_cov(self): g = self.frame.groupby('A') @@ -3016,14 +3016,14 @@ def test_expanding_corr_cov(self): def func(x): return getattr(x.expanding(), f)(self.frame) expected = g.apply(func) - assert_frame_equal(result, 
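# The groupby hunks above check that g.rolling(...) agrees with applying the
# rolling op group by group. Sketch with a hypothetical frame keyed on 'A'
# (the test's own fixture frame is not shown in the hunk):
import numpy as np
import pandas as pd
import pandas.util.testing as tm

frame = pd.DataFrame({'A': [1, 1, 1, 2, 2, 2, 2, 2], 'B': np.arange(8.)})
g = frame.groupby('A')
result = g.rolling(2).mean()
expected = g.apply(lambda x: x.rolling(2).mean())
tm.assert_frame_equal(result, expected)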
expected) + tm.assert_frame_equal(result, expected) result = getattr(r.B, f)(pairwise=True) def func(x): return getattr(x.B.expanding(), f)(pairwise=True) expected = g.apply(func) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) def test_expanding_apply(self): g = self.frame.groupby('A') @@ -3032,4 +3032,4 @@ def test_expanding_apply(self): # reduction result = r.apply(lambda x: x.sum()) expected = g.apply(lambda x: x.expanding().apply(lambda y: y.sum())) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tools/tests/test_concat.py b/pandas/tools/tests/test_concat.py index 62bd12130ca53..9d9b0635e0f35 100644 --- a/pandas/tools/tests/test_concat.py +++ b/pandas/tools/tests/test_concat.py @@ -266,7 +266,8 @@ def test_concat_keys_specific_levels(self): levels=[level], names=['group_key']) - self.assert_numpy_array_equal(result.columns.levels[0], level) + self.assert_index_equal(result.columns.levels[0], + Index(level, name='group_key')) self.assertEqual(result.columns.names[0], 'group_key') def test_concat_dataframe_keys_bug(self): @@ -413,7 +414,8 @@ def test_concat_keys_and_levels(self): ('baz', 'one'), ('baz', 'two')], names=['first', 'second']) self.assertEqual(result.index.names, ('first', 'second') + (None,)) - self.assert_numpy_array_equal(result.index.levels[0], ['baz', 'foo']) + self.assert_index_equal(result.index.levels[0], + Index(['baz', 'foo'], name='first')) def test_concat_keys_levels_no_overlap(self): # GH #1406 diff --git a/pandas/tools/tests/test_merge.py b/pandas/tools/tests/test_merge.py index efbe4c17ea544..2505309768997 100644 --- a/pandas/tools/tests/test_merge.py +++ b/pandas/tools/tests/test_merge.py @@ -200,8 +200,10 @@ def test_join_on(self): source = self.source merged = target.join(source, on='C') - self.assert_numpy_array_equal(merged['MergedA'], target['A']) - self.assert_numpy_array_equal(merged['MergedD'], target['D']) + self.assert_series_equal(merged['MergedA'], target['A'], + check_names=False) + self.assert_series_equal(merged['MergedD'], target['D'], + check_names=False) # join with duplicates (fix regression from DataFrame/Matrix merge) df = DataFrame({'key': ['a', 'a', 'b', 'b', 'c']}) @@ -286,7 +288,7 @@ def test_join_with_len0(self): merged2 = self.target.join(self.source.reindex([]), on='C', how='inner') - self.assertTrue(merged2.columns.equals(merged.columns)) + self.assert_index_equal(merged2.columns, merged.columns) self.assertEqual(len(merged2), 0) def test_join_on_inner(self): @@ -297,9 +299,11 @@ def test_join_on_inner(self): expected = df.join(df2, on='key') expected = expected[expected['value'].notnull()] - self.assert_numpy_array_equal(joined['key'], expected['key']) - self.assert_numpy_array_equal(joined['value'], expected['value']) - self.assertTrue(joined.index.equals(expected.index)) + self.assert_series_equal(joined['key'], expected['key'], + check_dtype=False) + self.assert_series_equal(joined['value'], expected['value'], + check_dtype=False) + self.assert_index_equal(joined.index, expected.index) def test_join_on_singlekey_list(self): df = DataFrame({'key': ['a', 'a', 'b', 'b', 'c']}) @@ -662,7 +666,7 @@ def test_join_sort(self): # smoke test joined = left.join(right, on='key', sort=False) - self.assert_numpy_array_equal(joined.index, lrange(4)) + self.assert_index_equal(joined.index, pd.Index(lrange(4))) def test_intelligently_handle_join_key(self): # #733, be a bit more 1337 about not returning unconsolidated DataFrame @@ -722,15 +726,16 @@ def 
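# test_join_on above now uses assert_series_equal(..., check_names=False):
# the joined column keeps its own name, so only values and index are
# compared. Sketch with hypothetical frames, not the test's fixtures:
import pandas as pd
import pandas.util.testing as tm

target = pd.DataFrame({'A': [1., 2., 3.], 'C': ['a', 'b', 'a']})
source = pd.DataFrame({'MergedA': [10., 20.]}, index=['a', 'b'])
merged = target.join(source, on='C')
# the name differs ('MergedA' vs None); the data must not
tm.assert_series_equal(merged['MergedA'], pd.Series([10., 20., 10.]),
                       check_names=False)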
test_handle_join_key_pass_array(self): rkey = np.array([1, 1, 2, 3, 4, 5]) merged = merge(left, right, left_on=lkey, right_on=rkey, how='outer') - self.assert_numpy_array_equal(merged['key_0'], - np.array([1, 1, 1, 1, 2, 2, 3, 4, 5])) + self.assert_series_equal(merged['key_0'], + Series([1, 1, 1, 1, 2, 2, 3, 4, 5], + name='key_0')) left = DataFrame({'value': lrange(3)}) right = DataFrame({'rvalue': lrange(6)}) - key = np.array([0, 1, 1, 2, 2, 3]) + key = np.array([0, 1, 1, 2, 2, 3], dtype=np.int64) merged = merge(left, right, left_index=True, right_on=key, how='outer') - self.assert_numpy_array_equal(merged['key_0'], key) + self.assert_series_equal(merged['key_0'], Series(key, name='key_0')) def test_mixed_type_join_with_suffix(self): # GH #916 diff --git a/pandas/tools/tests/test_tile.py b/pandas/tools/tests/test_tile.py index 55f27e1466a92..0b91fd1ef1c02 100644 --- a/pandas/tools/tests/test_tile.py +++ b/pandas/tools/tests/test_tile.py @@ -4,7 +4,7 @@ import numpy as np from pandas.compat import zip -from pandas import Series +from pandas import Series, Index import pandas.util.testing as tm from pandas.util.testing import assertRaisesRegexp import pandas.core.common as com @@ -19,32 +19,41 @@ class TestCut(tm.TestCase): def test_simple(self): data = np.ones(5) result = cut(data, 4, labels=False) - desired = [1, 1, 1, 1, 1] + desired = np.array([1, 1, 1, 1, 1], dtype=np.int64) tm.assert_numpy_array_equal(result, desired) def test_bins(self): data = np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1]) result, bins = cut(data, 3, retbins=True) - tm.assert_numpy_array_equal(result.codes, [0, 0, 0, 1, 2, 0]) - tm.assert_almost_equal(bins, [0.1905, 3.36666667, 6.53333333, 9.7]) + + exp_codes = np.array([0, 0, 0, 1, 2, 0], dtype=np.int8) + tm.assert_numpy_array_equal(result.codes, exp_codes) + exp = np.array([0.1905, 3.36666667, 6.53333333, 9.7]) + tm.assert_almost_equal(bins, exp) def test_right(self): data = np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1, 2.575]) result, bins = cut(data, 4, right=True, retbins=True) - tm.assert_numpy_array_equal(result.codes, [0, 0, 0, 2, 3, 0, 0]) - tm.assert_almost_equal(bins, [0.1905, 2.575, 4.95, 7.325, 9.7]) + exp_codes = np.array([0, 0, 0, 2, 3, 0, 0], dtype=np.int8) + tm.assert_numpy_array_equal(result.codes, exp_codes) + exp = np.array([0.1905, 2.575, 4.95, 7.325, 9.7]) + tm.assert_numpy_array_equal(bins, exp) def test_noright(self): data = np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1, 2.575]) result, bins = cut(data, 4, right=False, retbins=True) - tm.assert_numpy_array_equal(result.codes, [0, 0, 0, 2, 3, 0, 1]) - tm.assert_almost_equal(bins, [0.2, 2.575, 4.95, 7.325, 9.7095]) + exp_codes = np.array([0, 0, 0, 2, 3, 0, 1], dtype=np.int8) + tm.assert_numpy_array_equal(result.codes, exp_codes) + exp = np.array([0.2, 2.575, 4.95, 7.325, 9.7095]) + tm.assert_almost_equal(bins, exp) def test_arraylike(self): data = [.2, 1.4, 2.5, 6.2, 9.7, 2.1] result, bins = cut(data, 3, retbins=True) - tm.assert_numpy_array_equal(result.codes, [0, 0, 0, 1, 2, 0]) - tm.assert_almost_equal(bins, [0.1905, 3.36666667, 6.53333333, 9.7]) + exp_codes = np.array([0, 0, 0, 1, 2, 0], dtype=np.int8) + tm.assert_numpy_array_equal(result.codes, exp_codes) + exp = np.array([0.1905, 3.36666667, 6.53333333, 9.7]) + tm.assert_almost_equal(bins, exp) def test_bins_not_monotonic(self): data = [.2, 1.4, 2.5, 6.2, 9.7, 2.1] @@ -72,14 +81,14 @@ def test_labels(self): arr = np.tile(np.arange(0, 1.01, 0.1), 4) result, bins = cut(arr, 4, retbins=True) - ex_levels = ['(-0.001, 0.25]', '(0.25, 0.5]', '(0.5, 0.75]', - '(0.75, 
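# The test_tile hunks above pin exact dtypes: Categorical codes come back as
# int8 and the computed bin edges as float64. test_bins, restated via the
# public pd.cut alias:
import numpy as np
import pandas as pd
import pandas.util.testing as tm

data = np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1])
result, bins = pd.cut(data, 3, retbins=True)
tm.assert_numpy_array_equal(result.codes,
                            np.array([0, 0, 0, 1, 2, 0], dtype=np.int8))
tm.assert_almost_equal(bins, np.array([0.1905, 3.36666667, 6.53333333, 9.7]))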
1]'] - self.assert_numpy_array_equal(result.categories, ex_levels) + ex_levels = Index(['(-0.001, 0.25]', '(0.25, 0.5]', '(0.5, 0.75]', + '(0.75, 1]']) + self.assert_index_equal(result.categories, ex_levels) result, bins = cut(arr, 4, retbins=True, right=False) - ex_levels = ['[0, 0.25)', '[0.25, 0.5)', '[0.5, 0.75)', - '[0.75, 1.001)'] - self.assert_numpy_array_equal(result.categories, ex_levels) + ex_levels = Index(['[0, 0.25)', '[0.25, 0.5)', '[0.5, 0.75)', + '[0.75, 1.001)']) + self.assert_index_equal(result.categories, ex_levels) def test_cut_pass_series_name_to_factor(self): s = Series(np.random.randn(100), name='foo') @@ -91,9 +100,9 @@ def test_label_precision(self): arr = np.arange(0, 0.73, 0.01) result = cut(arr, 4, precision=2) - ex_levels = ['(-0.00072, 0.18]', '(0.18, 0.36]', '(0.36, 0.54]', - '(0.54, 0.72]'] - self.assert_numpy_array_equal(result.categories, ex_levels) + ex_levels = Index(['(-0.00072, 0.18]', '(0.18, 0.36]', + '(0.36, 0.54]', '(0.54, 0.72]']) + self.assert_index_equal(result.categories, ex_levels) def test_na_handling(self): arr = np.arange(0, 0.75, 0.01) @@ -118,10 +127,10 @@ def test_inf_handling(self): result = cut(data, [-np.inf, 2, 4, np.inf]) result_ser = cut(data_ser, [-np.inf, 2, 4, np.inf]) - ex_categories = ['(-inf, 2]', '(2, 4]', '(4, inf]'] + ex_categories = Index(['(-inf, 2]', '(2, 4]', '(4, inf]']) - tm.assert_numpy_array_equal(result.categories, ex_categories) - tm.assert_numpy_array_equal(result_ser.cat.categories, ex_categories) + tm.assert_index_equal(result.categories, ex_categories) + tm.assert_index_equal(result_ser.cat.categories, ex_categories) self.assertEqual(result[5], '(4, inf]') self.assertEqual(result[0], '(-inf, 2]') self.assertEqual(result_ser[5], '(4, inf]') @@ -135,7 +144,7 @@ def test_qcut(self): tm.assert_almost_equal(bins, ex_bins) ex_levels = cut(arr, ex_bins, include_lowest=True) - self.assert_numpy_array_equal(labels, ex_levels) + self.assert_categorical_equal(labels, ex_levels) def test_qcut_bounds(self): arr = np.random.randn(1000) @@ -148,7 +157,7 @@ def test_qcut_specify_quantiles(self): factor = qcut(arr, [0, .25, .5, .75, 1.]) expected = qcut(arr, 4) - self.assertTrue(factor.equals(expected)) + tm.assert_categorical_equal(factor, expected) def test_qcut_all_bins_same(self): assertRaisesRegexp(ValueError, "edges.*unique", qcut, @@ -173,7 +182,7 @@ def test_cut_pass_labels(self): exp = cut(arr, bins) exp.categories = labels - self.assertTrue(result.equals(exp)) + tm.assert_categorical_equal(result, exp) def test_qcut_include_lowest(self): values = np.arange(10) @@ -253,12 +262,14 @@ def test_series_retbins(self): # GH 8589 s = Series(np.arange(4)) result, bins = cut(s, 2, retbins=True) - tm.assert_numpy_array_equal(result.cat.codes.values, [0, 0, 1, 1]) - tm.assert_almost_equal(bins, [-0.003, 1.5, 3]) + tm.assert_numpy_array_equal(result.cat.codes.values, + np.array([0, 0, 1, 1], dtype=np.int8)) + tm.assert_numpy_array_equal(bins, np.array([-0.003, 1.5, 3])) result, bins = qcut(s, 2, retbins=True) - tm.assert_numpy_array_equal(result.cat.codes.values, [0, 0, 1, 1]) - tm.assert_almost_equal(bins, [0, 1.5, 3]) + tm.assert_numpy_array_equal(result.cat.codes.values, + np.array([0, 0, 1, 1], dtype=np.int8)) + tm.assert_numpy_array_equal(bins, np.array([0, 1.5, 3])) def curpath(): diff --git a/pandas/tools/tests/test_util.py b/pandas/tools/tests/test_util.py index 92a41199f264d..4e704554f982f 100644 --- a/pandas/tools/tests/test_util.py +++ b/pandas/tools/tests/test_util.py @@ -18,18 +18,21 @@ class 
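# test_qcut_specify_quantiles above: an integer bin count is shorthand for
# evenly spaced quantiles, now compared with assert_categorical_equal
# instead of Categorical.equals. Restated:
import numpy as np
import pandas as pd
import pandas.util.testing as tm

arr = np.random.randn(1000)
factor = pd.qcut(arr, [0, .25, .5, .75, 1.])
tm.assert_categorical_equal(factor, pd.qcut(arr, 4))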
TestCartesianProduct(tm.TestCase): def test_simple(self): x, y = list('ABC'), [1, 22] - result = cartesian_product([x, y]) - expected = [np.array(['A', 'A', 'B', 'B', 'C', 'C']), - np.array([1, 22, 1, 22, 1, 22])] - tm.assert_numpy_array_equal(result, expected) + result1, result2 = cartesian_product([x, y]) + expected1 = np.array(['A', 'A', 'B', 'B', 'C', 'C']) + expected2 = np.array([1, 22, 1, 22, 1, 22]) + tm.assert_numpy_array_equal(result1, expected1) + tm.assert_numpy_array_equal(result2, expected2) def test_datetimeindex(self): # regression test for GitHub issue #6439 # make sure that the ordering on datetimeindex is consistent x = date_range('2000-01-01', periods=2) - result = [Index(y).day for y in cartesian_product([x, x])] - expected = [np.array([1, 1, 2, 2]), np.array([1, 2, 1, 2])] - tm.assert_numpy_array_equal(result, expected) + result1, result2 = [Index(y).day for y in cartesian_product([x, x])] + expected1 = np.array([1, 1, 2, 2], dtype=np.int32) + expected2 = np.array([1, 2, 1, 2], dtype=np.int32) + tm.assert_numpy_array_equal(result1, expected1) + tm.assert_numpy_array_equal(result2, expected2) class TestLocaleUtils(tm.TestCase): diff --git a/pandas/tseries/tests/test_base.py b/pandas/tseries/tests/test_base.py index 97b551070f541..7077a23d5abcb 100644 --- a/pandas/tseries/tests/test_base.py +++ b/pandas/tseries/tests/test_base.py @@ -62,7 +62,7 @@ def test_asobject_tolist(self): self.assertTrue(isinstance(result, Index)) self.assertEqual(result.dtype, object) - self.assertTrue(result.equals(expected)) + self.assert_index_equal(result, expected) self.assertEqual(result.name, expected.name) self.assertEqual(idx.tolist(), expected_list) @@ -76,7 +76,7 @@ def test_asobject_tolist(self): result = idx.asobject self.assertTrue(isinstance(result, Index)) self.assertEqual(result.dtype, object) - self.assertTrue(result.equals(expected)) + self.assert_index_equal(result, expected) self.assertEqual(result.name, expected.name) self.assertEqual(idx.tolist(), expected_list) @@ -89,7 +89,7 @@ def test_asobject_tolist(self): result = idx.asobject self.assertTrue(isinstance(result, Index)) self.assertEqual(result.dtype, object) - self.assertTrue(result.equals(expected)) + self.assert_index_equal(result, expected) self.assertEqual(result.name, expected.name) self.assertEqual(idx.tolist(), expected_list) @@ -726,7 +726,7 @@ def test_asobject_tolist(self): self.assertTrue(isinstance(result, Index)) self.assertEqual(result.dtype, object) - self.assertTrue(result.equals(expected)) + self.assert_index_equal(result, expected) self.assertEqual(result.name, expected.name) self.assertEqual(idx.tolist(), expected_list) @@ -738,7 +738,7 @@ def test_asobject_tolist(self): result = idx.asobject self.assertTrue(isinstance(result, Index)) self.assertEqual(result.dtype, object) - self.assertTrue(result.equals(expected)) + self.assert_index_equal(result, expected) self.assertEqual(result.name, expected.name) self.assertEqual(idx.tolist(), expected_list) @@ -1489,7 +1489,7 @@ def test_asobject_tolist(self): result = idx.asobject self.assertTrue(isinstance(result, Index)) self.assertEqual(result.dtype, object) - self.assertTrue(result.equals(expected)) + self.assert_index_equal(result, expected) self.assertEqual(result.name, expected.name) self.assertEqual(idx.tolist(), expected_list) diff --git a/pandas/tseries/tests/test_daterange.py b/pandas/tseries/tests/test_daterange.py index 6e572289a3cae..6ad33b6b973de 100644 --- a/pandas/tseries/tests/test_daterange.py +++ b/pandas/tseries/tests/test_daterange.py @@ 
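# test_simple above unpacks cartesian_product's two output arrays and
# compares them one by one, since assert_numpy_array_equal no longer takes a
# list of arrays. Restated:
import numpy as np
import pandas.util.testing as tm
from pandas.tools.util import cartesian_product

result1, result2 = cartesian_product([list('ABC'), [1, 22]])
tm.assert_numpy_array_equal(result1, np.array(['A', 'A', 'B', 'B', 'C', 'C']))
tm.assert_numpy_array_equal(result2, np.array([1, 22, 1, 22, 1, 22]))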
-25,15 +25,16 @@ def eq_gen_range(kwargs, expected): class TestGenRangeGeneration(tm.TestCase): + def test_generate(self): rng1 = list(generate_range(START, END, offset=datetools.bday)) rng2 = list(generate_range(START, END, time_rule='B')) - self.assert_numpy_array_equal(rng1, rng2) + self.assertEqual(rng1, rng2) def test_generate_cday(self): rng1 = list(generate_range(START, END, offset=datetools.cday)) rng2 = list(generate_range(START, END, time_rule='C')) - self.assert_numpy_array_equal(rng1, rng2) + self.assertEqual(rng1, rng2) def test_1(self): eq_gen_range(dict(start=datetime(2009, 3, 25), periods=2), @@ -68,8 +69,8 @@ def test_precision_finer_than_offset(self): freq='Q-DEC', tz=None) expected2 = DatetimeIndex(expected2_list, dtype='datetime64[ns]', freq='W-SUN', tz=None) - self.assertTrue(result1.equals(expected1)) - self.assertTrue(result2.equals(expected2)) + self.assert_index_equal(result1, expected1) + self.assert_index_equal(result2, expected2) class TestDateRange(tm.TestCase): @@ -140,7 +141,7 @@ def test_comparison(self): def test_copy(self): cp = self.rng.copy() repr(cp) - self.assertTrue(cp.equals(self.rng)) + self.assert_index_equal(cp, self.rng) def test_repr(self): # only really care that it works @@ -148,7 +149,9 @@ def test_repr(self): def test_getitem(self): smaller = self.rng[:5] - self.assert_numpy_array_equal(smaller, self.rng.view(np.ndarray)[:5]) + exp = DatetimeIndex(self.rng.view(np.ndarray)[:5]) + self.assert_index_equal(smaller, exp) + self.assertEqual(smaller.offset, self.rng.offset) sliced = self.rng[::5] @@ -211,7 +214,7 @@ def test_union(self): tm.assertIsInstance(the_union, DatetimeIndex) # order does not matter - self.assert_numpy_array_equal(right.union(left), the_union) + tm.assert_index_equal(right.union(left), the_union) # overlapping, but different offset rng = date_range(START, END, freq=datetools.bmonthEnd) @@ -256,13 +259,13 @@ def test_union_not_cacheable(self): rng1 = rng[10:] rng2 = rng[:25] the_union = rng1.union(rng2) - self.assertTrue(the_union.equals(rng)) + self.assert_index_equal(the_union, rng) rng1 = rng[10:] rng2 = rng[15:35] the_union = rng1.union(rng2) expected = rng[10:] - self.assertTrue(the_union.equals(expected)) + self.assert_index_equal(the_union, expected) def test_intersection(self): rng = date_range('1/1/2000', periods=50, freq=datetools.Minute()) @@ -270,24 +273,24 @@ def test_intersection(self): rng2 = rng[:25] the_int = rng1.intersection(rng2) expected = rng[10:25] - self.assertTrue(the_int.equals(expected)) + self.assert_index_equal(the_int, expected) tm.assertIsInstance(the_int, DatetimeIndex) self.assertEqual(the_int.offset, rng.offset) the_int = rng1.intersection(rng2.view(DatetimeIndex)) - self.assertTrue(the_int.equals(expected)) + self.assert_index_equal(the_int, expected) # non-overlapping the_int = rng[:10].intersection(rng[10:]) expected = DatetimeIndex([]) - self.assertTrue(the_int.equals(expected)) + self.assert_index_equal(the_int, expected) def test_intersection_bug(self): # GH #771 a = bdate_range('11/30/2011', '12/31/2011') b = bdate_range('12/10/2011', '12/20/2011') result = a.intersection(b) - self.assertTrue(result.equals(b)) + self.assert_index_equal(result, b) def test_summary(self): self.rng.summary() @@ -364,7 +367,7 @@ def test_range_bug(self): start = datetime(2011, 1, 1) exp_values = [start + i * offset for i in range(5)] - self.assert_numpy_array_equal(result, DatetimeIndex(exp_values)) + tm.assert_index_equal(result, DatetimeIndex(exp_values)) def test_range_tz_pytz(self): # GH 2906 @@ 
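# The daterange hunks above trade assertTrue(x.equals(y)) for
# assert_index_equal, which reports a useful diff on failure. The
# intersection case, restated (freq 'T' is one minute):
import pandas as pd
import pandas.util.testing as tm

rng = pd.date_range('1/1/2000', periods=50, freq='T')
the_int = rng[10:].intersection(rng[:25])
tm.assert_index_equal(the_int, rng[10:25])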
-494,8 +497,8 @@ def test_range_closed(self): if begin == closed[0]: expected_right = closed[1:] - self.assertTrue(expected_left.equals(left)) - self.assertTrue(expected_right.equals(right)) + self.assert_index_equal(expected_left, left) + self.assert_index_equal(expected_right, right) def test_range_closed_with_tz_aware_start_end(self): # GH12409 @@ -514,8 +517,8 @@ def test_range_closed_with_tz_aware_start_end(self): if begin == closed[0]: expected_right = closed[1:] - self.assertTrue(expected_left.equals(left)) - self.assertTrue(expected_right.equals(right)) + self.assert_index_equal(expected_left, left) + self.assert_index_equal(expected_right, right) # test with default frequency, UTC begin = Timestamp('2011/1/1', tz='UTC') @@ -546,9 +549,9 @@ def test_range_closed_boundary(self): expected_right = both_boundary[1:] expected_left = both_boundary[:-1] - self.assertTrue(right_boundary.equals(expected_right)) - self.assertTrue(left_boundary.equals(expected_left)) - self.assertTrue(both_boundary.equals(expected_both)) + self.assert_index_equal(right_boundary, expected_right) + self.assert_index_equal(left_boundary, expected_left) + self.assert_index_equal(both_boundary, expected_both) def test_years_only(self): # GH 6961 @@ -570,8 +573,8 @@ def test_freq_divides_end_in_nanos(self): '2005-01-13 15:45:00'], dtype='datetime64[ns]', freq='345T', tz=None) - self.assertTrue(result_1.equals(expected_1)) - self.assertTrue(result_2.equals(expected_2)) + self.assert_index_equal(result_1, expected_1) + self.assert_index_equal(result_2, expected_2) class TestCustomDateRange(tm.TestCase): @@ -613,7 +616,7 @@ def test_comparison(self): def test_copy(self): cp = self.rng.copy() repr(cp) - self.assertTrue(cp.equals(self.rng)) + self.assert_index_equal(cp, self.rng) def test_repr(self): # only really care that it works @@ -621,7 +624,8 @@ def test_repr(self): def test_getitem(self): smaller = self.rng[:5] - self.assert_numpy_array_equal(smaller, self.rng.view(np.ndarray)[:5]) + exp = DatetimeIndex(self.rng.view(np.ndarray)[:5]) + self.assert_index_equal(smaller, exp) self.assertEqual(smaller.offset, self.rng.offset) sliced = self.rng[::5] @@ -686,7 +690,7 @@ def test_union(self): tm.assertIsInstance(the_union, DatetimeIndex) # order does not matter - self.assert_numpy_array_equal(right.union(left), the_union) + self.assert_index_equal(right.union(left), the_union) # overlapping, but different offset rng = date_range(START, END, freq=datetools.bmonthEnd) @@ -731,7 +735,7 @@ def test_intersection_bug(self): a = cdate_range('11/30/2011', '12/31/2011') b = cdate_range('12/10/2011', '12/20/2011') result = a.intersection(b) - self.assertTrue(result.equals(b)) + self.assert_index_equal(result, b) def test_summary(self): self.rng.summary() @@ -783,25 +787,25 @@ def test_daterange_bug_456(self): def test_cdaterange(self): rng = cdate_range('2013-05-01', periods=3) xp = DatetimeIndex(['2013-05-01', '2013-05-02', '2013-05-03']) - self.assertTrue(xp.equals(rng)) + self.assert_index_equal(xp, rng) def test_cdaterange_weekmask(self): rng = cdate_range('2013-05-01', periods=3, weekmask='Sun Mon Tue Wed Thu') xp = DatetimeIndex(['2013-05-01', '2013-05-02', '2013-05-05']) - self.assertTrue(xp.equals(rng)) + self.assert_index_equal(xp, rng) def test_cdaterange_holidays(self): rng = cdate_range('2013-05-01', periods=3, holidays=['2013-05-01']) xp = DatetimeIndex(['2013-05-02', '2013-05-03', '2013-05-06']) - self.assertTrue(xp.equals(rng)) + self.assert_index_equal(xp, rng) def test_cdaterange_weekmask_and_holidays(self): rng 
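# test_cdaterange_holidays above, restated: custom business days skip the
# listed holiday (2013-05-01 is a Wednesday), and the result is checked with
# assert_index_equal. The import path is assumed for the 0.18-era layout:
import pandas as pd
import pandas.util.testing as tm
from pandas.tseries.index import cdate_range

rng = cdate_range('2013-05-01', periods=3, holidays=['2013-05-01'])
xp = pd.DatetimeIndex(['2013-05-02', '2013-05-03', '2013-05-06'])
tm.assert_index_equal(xp, rng)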
= cdate_range('2013-05-01', periods=3, weekmask='Sun Mon Tue Wed Thu', holidays=['2013-05-01']) xp = DatetimeIndex(['2013-05-02', '2013-05-05', '2013-05-06']) - self.assertTrue(xp.equals(rng)) + self.assert_index_equal(xp, rng) if __name__ == '__main__': diff --git a/pandas/tseries/tests/test_offsets.py b/pandas/tseries/tests/test_offsets.py index 0e91e396965fa..ec88acc421cdb 100644 --- a/pandas/tseries/tests/test_offsets.py +++ b/pandas/tseries/tests/test_offsets.py @@ -4551,7 +4551,7 @@ def test_all_offset_classes(self): for offset, test_values in iteritems(tests): first = Timestamp(test_values[0], tz='US/Eastern') + offset() second = Timestamp(test_values[1], tz='US/Eastern') - self.assertEqual(first, second, str(offset)) + self.assertEqual(first, second, msg=str(offset)) if __name__ == '__main__': diff --git a/pandas/tseries/tests/test_period.py b/pandas/tseries/tests/test_period.py index b0df824f0a832..8e6d339b87623 100644 --- a/pandas/tseries/tests/test_period.py +++ b/pandas/tseries/tests/test_period.py @@ -26,8 +26,6 @@ from pandas import (Series, DataFrame, _np_version_under1p9, _np_version_under1p12) from pandas import tslib -from pandas.util.testing import (assert_index_equal, assert_series_equal, - assert_almost_equal, assertRaisesRegexp) import pandas.util.testing as tm @@ -1752,22 +1750,21 @@ def test_constructor_simple_new(self): result = idx._simple_new(idx.astype('i8'), 'p', freq=idx.freq) tm.assert_index_equal(result, idx) - result = idx._simple_new( - [pd.Period('2007-01', freq='M'), pd.Period('2007-02', freq='M')], - 'p', freq=idx.freq) - self.assertTrue(result.equals(idx)) + result = idx._simple_new([pd.Period('2007-01', freq='M'), + pd.Period('2007-02', freq='M')], + 'p', freq=idx.freq) + self.assert_index_equal(result, idx) - result = idx._simple_new( - np.array([pd.Period('2007-01', freq='M'), - pd.Period('2007-02', freq='M')]), - 'p', freq=idx.freq) - self.assertTrue(result.equals(idx)) + result = idx._simple_new(np.array([pd.Period('2007-01', freq='M'), + pd.Period('2007-02', freq='M')]), + 'p', freq=idx.freq) + self.assert_index_equal(result, idx) def test_constructor_simple_new_empty(self): # GH13079 idx = PeriodIndex([], freq='M', name='p') result = idx._simple_new(idx, name='p', freq='M') - assert_index_equal(result, idx) + tm.assert_index_equal(result, idx) def test_constructor_simple_new_floats(self): # GH13079 @@ -1782,7 +1779,7 @@ def test_shallow_copy_empty(self): result = idx._shallow_copy() expected = idx - assert_index_equal(result, expected) + tm.assert_index_equal(result, expected) def test_constructor_nat(self): self.assertRaises(ValueError, period_range, start='NaT', @@ -1902,7 +1899,7 @@ def test_getitem_partial(self): exp = result result = ts[24:] - assert_series_equal(exp, result) + tm.assert_series_equal(exp, result) ts = ts[10:].append(ts[10:]) self.assertRaisesRegexp(KeyError, @@ -1918,7 +1915,7 @@ def test_getitem_datetime(self): dt4 = datetime(2012, 4, 20) rs = ts[dt1:dt4] - assert_series_equal(rs, ts) + tm.assert_series_equal(rs, ts) def test_slice_with_negative_step(self): ts = Series(np.arange(20), @@ -1926,9 +1923,9 @@ def test_slice_with_negative_step(self): SLC = pd.IndexSlice def assert_slices_equivalent(l_slc, i_slc): - assert_series_equal(ts[l_slc], ts.iloc[i_slc]) - assert_series_equal(ts.loc[l_slc], ts.iloc[i_slc]) - assert_series_equal(ts.ix[l_slc], ts.iloc[i_slc]) + tm.assert_series_equal(ts[l_slc], ts.iloc[i_slc]) + tm.assert_series_equal(ts.loc[l_slc], ts.iloc[i_slc]) + tm.assert_series_equal(ts.ix[l_slc], ts.iloc[i_slc]) 
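# test_slice_with_negative_step above checks that label-based reversed
# slices agree with their positional equivalents across [], .loc and .ix.
# Sketch (the index is assumed from the full test source):
import numpy as np
import pandas as pd
import pandas.util.testing as tm

ts = pd.Series(np.arange(20),
               pd.period_range('2014-01', periods=20, freq='M'))
# Period('2014-10') sits at position 9, so both slices reverse from there
tm.assert_series_equal(ts[pd.Period('2014-10')::-1], ts.iloc[9::-1])
tm.assert_series_equal(ts.loc[pd.Period('2014-10')::-1], ts.iloc[9::-1])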
assert_slices_equivalent(SLC[Period('2014-10')::-1], SLC[9::-1]) assert_slices_equivalent(SLC['2014-10'::-1], SLC[9::-1]) @@ -2100,13 +2097,13 @@ def test_as_frame_columns(self): df = DataFrame(randn(10, 5), columns=rng) ts = df[rng[0]] - assert_series_equal(ts, df.ix[:, 0]) + tm.assert_series_equal(ts, df.ix[:, 0]) # GH # 1211 repr(df) ts = df['1/1/2000'] - assert_series_equal(ts, df.ix[:, 0]) + tm.assert_series_equal(ts, df.ix[:, 0]) def test_indexing(self): @@ -2151,7 +2148,7 @@ def test_frame_to_time_stamp(self): exp_index = date_range('1/1/2001', end='12/31/2009', freq='A-DEC') result = df.to_timestamp('D', 'end') tm.assert_index_equal(result.index, exp_index) - assert_almost_equal(result.values, df.values) + tm.assert_numpy_array_equal(result.values, df.values) exp_index = date_range('1/1/2001', end='1/1/2009', freq='AS-JAN') result = df.to_timestamp('D', 'start') @@ -2182,7 +2179,7 @@ def _get_with_delta(delta, freq='A-DEC'): exp_index = date_range('1/1/2001', end='12/31/2009', freq='A-DEC') result = df.to_timestamp('D', 'end', axis=1) tm.assert_index_equal(result.columns, exp_index) - assert_almost_equal(result.values, df.values) + tm.assert_numpy_array_equal(result.values, df.values) exp_index = date_range('1/1/2001', end='1/1/2009', freq='AS-JAN') result = df.to_timestamp('D', 'start', axis=1) @@ -2204,7 +2201,7 @@ def _get_with_delta(delta, freq='A-DEC'): tm.assert_index_equal(result.columns, exp_index) # invalid axis - assertRaisesRegexp(ValueError, 'axis', df.to_timestamp, axis=2) + tm.assertRaisesRegexp(ValueError, 'axis', df.to_timestamp, axis=2) result1 = df.to_timestamp('5t', axis=1) result2 = df.to_timestamp('t', axis=1) @@ -2224,7 +2221,7 @@ def test_index_duplicate_periods(self): result = ts[2007] expected = ts[1:3] - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) result[:] = 1 self.assertTrue((ts[1:3] == 1).all()) @@ -2234,19 +2231,19 @@ def test_index_duplicate_periods(self): result = ts[2007] expected = ts[idx == 2007] - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) def test_index_unique(self): idx = PeriodIndex([2000, 2007, 2007, 2009, 2009], freq='A-JUN') expected = PeriodIndex([2000, 2007, 2009], freq='A-JUN') - self.assert_numpy_array_equal(idx.unique(), expected.values) + self.assert_index_equal(idx.unique(), expected) self.assertEqual(idx.nunique(), 3) idx = PeriodIndex([2000, 2007, 2007, 2009, 2007], freq='A-JUN', tz='US/Eastern') expected = PeriodIndex([2000, 2007, 2009], freq='A-JUN', tz='US/Eastern') - self.assert_numpy_array_equal(idx.unique(), expected.values) + self.assert_index_equal(idx.unique(), expected) self.assertEqual(idx.nunique(), 3) def test_constructor(self): @@ -2336,20 +2333,17 @@ def test_repeat(self): Period('2001-01-02'), Period('2001-01-02'), ]) - assert_index_equal(index.repeat(2), expected) + tm.assert_index_equal(index.repeat(2), expected) def test_numpy_repeat(self): index = period_range('20010101', periods=2) - expected = PeriodIndex([ - Period('2001-01-01'), Period('2001-01-01'), - Period('2001-01-02'), Period('2001-01-02'), - ]) + expected = PeriodIndex([Period('2001-01-01'), Period('2001-01-01'), + Period('2001-01-02'), Period('2001-01-02')]) - assert_index_equal(np.repeat(index, 2), expected) + tm.assert_index_equal(np.repeat(index, 2), expected) msg = "the 'axis' parameter is not supported" - assertRaisesRegexp(ValueError, msg, np.repeat, - index, 2, axis=1) + tm.assertRaisesRegexp(ValueError, msg, np.repeat, index, 2, axis=1) def test_shift(self): pi1 = 
PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009') @@ -2598,7 +2592,7 @@ def test_negative_ordinals(self): idx1 = PeriodIndex(ordinal=[-1, 0, 1], freq='A') idx2 = PeriodIndex(ordinal=np.array([-1, 0, 1]), freq='A') - tm.assert_numpy_array_equal(idx1, idx2) + tm.assert_index_equal(idx1, idx2) def test_dti_to_period(self): dti = DatetimeIndex(start='1/1/2005', end='12/1/2005', freq='M') @@ -2626,10 +2620,10 @@ def test_pindex_slice_index(self): s = Series(np.random.rand(len(pi)), index=pi) res = s['2010'] exp = s[0:12] - assert_series_equal(res, exp) + tm.assert_series_equal(res, exp) res = s['2011'] exp = s[12:24] - assert_series_equal(res, exp) + tm.assert_series_equal(res, exp) def test_getitem_day(self): # GH 6716 @@ -2655,9 +2649,9 @@ def test_getitem_day(self): continue s = Series(np.random.rand(len(idx)), index=idx) - assert_series_equal(s['2013/01'], s[0:31]) - assert_series_equal(s['2013/02'], s[31:59]) - assert_series_equal(s['2014'], s[365:]) + tm.assert_series_equal(s['2013/01'], s[0:31]) + tm.assert_series_equal(s['2013/02'], s[31:59]) + tm.assert_series_equal(s['2014'], s[365:]) invalid = ['2013/02/01 9H', '2013/02/01 09:00'] for v in invalid: @@ -2683,10 +2677,10 @@ def test_range_slice_day(self): s = Series(np.random.rand(len(idx)), index=idx) - assert_series_equal(s['2013/01/02':], s[1:]) - assert_series_equal(s['2013/01/02':'2013/01/05'], s[1:5]) - assert_series_equal(s['2013/02':], s[31:]) - assert_series_equal(s['2014':], s[365:]) + tm.assert_series_equal(s['2013/01/02':], s[1:]) + tm.assert_series_equal(s['2013/01/02':'2013/01/05'], s[1:5]) + tm.assert_series_equal(s['2013/02':], s[31:]) + tm.assert_series_equal(s['2014':], s[365:]) invalid = ['2013/02/01 9H', '2013/02/01 09:00'] for v in invalid: @@ -2716,10 +2710,10 @@ def test_getitem_seconds(self): continue s = Series(np.random.rand(len(idx)), index=idx) - assert_series_equal(s['2013/01/01 10:00'], s[3600:3660]) - assert_series_equal(s['2013/01/01 9H'], s[:3600]) + tm.assert_series_equal(s['2013/01/01 10:00'], s[3600:3660]) + tm.assert_series_equal(s['2013/01/01 9H'], s[:3600]) for d in ['2013/01/01', '2013/01', '2013']: - assert_series_equal(s[d], s) + tm.assert_series_equal(s[d], s) def test_range_slice_seconds(self): # GH 6716 @@ -2741,14 +2735,14 @@ def test_range_slice_seconds(self): s = Series(np.random.rand(len(idx)), index=idx) - assert_series_equal(s['2013/01/01 09:05':'2013/01/01 09:10'], - s[300:660]) - assert_series_equal(s['2013/01/01 10:00':'2013/01/01 10:05'], - s[3600:3960]) - assert_series_equal(s['2013/01/01 10H':], s[3600:]) - assert_series_equal(s[:'2013/01/01 09:30'], s[:1860]) + tm.assert_series_equal(s['2013/01/01 09:05':'2013/01/01 09:10'], + s[300:660]) + tm.assert_series_equal(s['2013/01/01 10:00':'2013/01/01 10:05'], + s[3600:3960]) + tm.assert_series_equal(s['2013/01/01 10H':], s[3600:]) + tm.assert_series_equal(s[:'2013/01/01 09:30'], s[:1860]) for d in ['2013/01/01', '2013/01', '2013']: - assert_series_equal(s[d:], s) + tm.assert_series_equal(s[d:], s) def test_range_slice_outofbounds(self): # GH 5407 @@ -2757,8 +2751,8 @@ def test_range_slice_outofbounds(self): for idx in [didx, pidx]: df = DataFrame(dict(units=[100 + i for i in range(10)]), index=idx) - empty = DataFrame(index=idx.__class__( - [], freq='D'), columns=['units']) + empty = DataFrame(index=idx.__class__([], freq='D'), + columns=['units']) empty['units'] = empty['units'].astype('int64') tm.assert_frame_equal(df['2013/09/01':'2013/09/30'], empty) @@ -2949,16 +2943,16 @@ def test_align_series(self): result = ts + 
ts[::2] expected = ts + ts expected[1::2] = np.nan - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) result = ts + _permute(ts[::2]) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) # it works! for kind in ['inner', 'outer', 'left', 'right']: ts.align(ts[::2], join=kind) msg = "Input has different freq=D from PeriodIndex\\(freq=A-DEC\\)" - with assertRaisesRegexp(period.IncompatibleFrequency, msg): + with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg): ts + ts.asfreq('D', how="end") def test_align_frame(self): @@ -3158,7 +3152,7 @@ def test_map(self): tm.assert_index_equal(result, expected) result = index.map(lambda x: x.ordinal) - exp = [x.ordinal for x in index] + exp = np.array([x.ordinal for x in index], dtype=np.int64) tm.assert_numpy_array_equal(result, exp) def test_map_with_string_constructor(self): @@ -4231,19 +4225,19 @@ def test_constructor_cast_object(self): def test_series_comparison_scalars(self): val = pd.Period('2000-01-04', freq='D') result = self.series > val - expected = np.array([x > val for x in self.series]) - self.assert_numpy_array_equal(result, expected) + expected = pd.Series([x > val for x in self.series]) + tm.assert_series_equal(result, expected) val = self.series[5] result = self.series > val - expected = np.array([x > val for x in self.series]) - self.assert_numpy_array_equal(result, expected) + expected = pd.Series([x > val for x in self.series]) + tm.assert_series_equal(result, expected) def test_between(self): left, right = self.series[[2, 7]] result = self.series.between(left, right) expected = (self.series >= left) & (self.series <= right) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) # --------------------------------------------------------------------- # NaT support @@ -4262,7 +4256,7 @@ def test_NaT_scalar(self): def test_NaT_cast(self): result = Series([np.nan]).astype('period[D]') expected = Series([NaT]) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) """ def test_set_none_nan(self): diff --git a/pandas/tseries/tests/test_plotting.py b/pandas/tseries/tests/test_plotting.py index 67df62e1ebb57..2255f9fae73de 100644 --- a/pandas/tseries/tests/test_plotting.py +++ b/pandas/tseries/tests/test_plotting.py @@ -330,7 +330,7 @@ def test_dataframe(self): bts = DataFrame({'a': tm.makeTimeSeries()}) ax = bts.plot() idx = ax.get_lines()[0].get_xdata() - tm.assert_numpy_array_equal(bts.index.to_period(), PeriodIndex(idx)) + tm.assert_index_equal(bts.index.to_period(), PeriodIndex(idx)) @slow def test_axis_limits(self): @@ -1113,7 +1113,7 @@ def test_ax_plot(self): fig = plt.figure() ax = fig.add_subplot(111) lines = ax.plot(x, y, label='Y') - tm.assert_numpy_array_equal(DatetimeIndex(lines[0].get_xdata()), x) + tm.assert_index_equal(DatetimeIndex(lines[0].get_xdata()), x) @slow def test_mpl_nopandas(self): diff --git a/pandas/tseries/tests/test_resample.py b/pandas/tseries/tests/test_resample.py index 6b94c828bddc0..2236d20975eee 100644 --- a/pandas/tseries/tests/test_resample.py +++ b/pandas/tseries/tests/test_resample.py @@ -1418,7 +1418,7 @@ def test_resample_base(self): resampled = ts.resample('5min', base=2).mean() exp_rng = date_range('12/31/1999 23:57:00', '1/1/2000 01:57', freq='5min') - self.assertTrue(resampled.index.equals(exp_rng)) + self.assert_index_equal(resampled.index, exp_rng) def test_resample_base_with_timedeltaindex(self): @@ -1432,8 +1432,8 @@ def test_resample_base_with_timedeltaindex(self): 
exp_without_base = timedelta_range(start='0s', end='25s', freq='2s') exp_with_base = timedelta_range(start='5s', end='29s', freq='2s') - self.assertTrue(without_base.index.equals(exp_without_base)) - self.assertTrue(with_base.index.equals(exp_with_base)) + self.assert_index_equal(without_base.index, exp_without_base) + self.assert_index_equal(with_base.index, exp_with_base) def test_resample_categorical_data_with_timedeltaindex(self): # GH #12169 @@ -1464,7 +1464,7 @@ def test_resample_to_period_monthly_buglet(self): result = ts.resample('M', kind='period').mean() exp_index = period_range('Jan-2000', 'Dec-2000', freq='M') - self.assertTrue(result.index.equals(exp_index)) + self.assert_index_equal(result.index, exp_index) def test_period_with_agg(self): @@ -1627,7 +1627,7 @@ def test_corner_cases(self): result = ts.resample('5t', closed='right', label='left').mean() ex_index = date_range('1999-12-31 23:55', periods=4, freq='5t') - self.assertTrue(result.index.equals(ex_index)) + self.assert_index_equal(result.index, ex_index) len0pts = _simple_pts('2007-01', '2010-05', freq='M')[:0] # it works @@ -2391,7 +2391,7 @@ def test_closed_left_corner(self): ex_index = date_range(start='1/1/2012 9:30', freq='10min', periods=3) - self.assertTrue(result.index.equals(ex_index)) + self.assert_index_equal(result.index, ex_index) assert_series_equal(result, exp) def test_quarterly_resampling(self): @@ -2760,7 +2760,7 @@ def test_apply_iteration(self): # it works! result = grouped.apply(f) - self.assertTrue(result.index.equals(df.index)) + self.assert_index_equal(result.index, df.index) def test_panel_aggregation(self): ind = pd.date_range('1/1/2000', periods=100) diff --git a/pandas/tseries/tests/test_timedeltas.py b/pandas/tseries/tests/test_timedeltas.py index 20098488f7f1c..10276137b42a1 100644 --- a/pandas/tseries/tests/test_timedeltas.py +++ b/pandas/tseries/tests/test_timedeltas.py @@ -1223,7 +1223,7 @@ def test_total_seconds(self): freq='s') expt = [1 * 86400 + 10 * 3600 + 11 * 60 + 12 + 100123456. / 1e9, 1 * 86400 + 10 * 3600 + 11 * 60 + 13 + 100123456. 
/ 1e9] - tm.assert_almost_equal(rng.total_seconds(), expt) + tm.assert_almost_equal(rng.total_seconds(), np.array(expt)) # test Series s = Series(rng) @@ -1288,7 +1288,7 @@ def test_constructor(self): def test_constructor_coverage(self): rng = timedelta_range('1 days', periods=10.5) exp = timedelta_range('1 days', periods=10) - self.assertTrue(rng.equals(exp)) + self.assert_index_equal(rng, exp) self.assertRaises(ValueError, TimedeltaIndex, start='1 days', periods='foo', freq='D') @@ -1302,16 +1302,16 @@ def test_constructor_coverage(self): gen = (timedelta(i) for i in range(10)) result = TimedeltaIndex(gen) expected = TimedeltaIndex([timedelta(i) for i in range(10)]) - self.assertTrue(result.equals(expected)) + self.assert_index_equal(result, expected) # NumPy string array strings = np.array(['1 days', '2 days', '3 days']) result = TimedeltaIndex(strings) expected = to_timedelta([1, 2, 3], unit='d') - self.assertTrue(result.equals(expected)) + self.assert_index_equal(result, expected) from_ints = TimedeltaIndex(expected.asi8) - self.assertTrue(from_ints.equals(expected)) + self.assert_index_equal(from_ints, expected) # non-conforming freq self.assertRaises(ValueError, TimedeltaIndex, @@ -1438,7 +1438,7 @@ def test_map(self): f = lambda x: x.days result = rng.map(f) - exp = [f(x) for x in rng] + exp = np.array([f(x) for x in rng], dtype=np.int64) self.assert_numpy_array_equal(result, exp) def test_misc_coverage(self): @@ -1459,7 +1459,7 @@ def test_union(self): i2 = timedelta_range('3day', periods=5) result = i1.union(i2) expected = timedelta_range('1day', periods=7) - self.assert_numpy_array_equal(result, expected) + self.assert_index_equal(result, expected) i1 = Int64Index(np.arange(0, 20, 2)) i2 = TimedeltaIndex(start='1 day', periods=10, freq='D') @@ -1471,10 +1471,10 @@ def test_union_coverage(self): idx = TimedeltaIndex(['3d', '1d', '2d']) ordered = TimedeltaIndex(idx.sort_values(), freq='infer') result = ordered.union(idx) - self.assertTrue(result.equals(ordered)) + self.assert_index_equal(result, ordered) result = ordered[:0].union(ordered) - self.assertTrue(result.equals(ordered)) + self.assert_index_equal(result, ordered) self.assertEqual(result.freq, ordered.freq) def test_union_bug_1730(self): @@ -1484,18 +1484,18 @@ def test_union_bug_1730(self): result = rng_a.union(rng_b) exp = TimedeltaIndex(sorted(set(list(rng_a)) | set(list(rng_b)))) - self.assertTrue(result.equals(exp)) + self.assert_index_equal(result, exp) def test_union_bug_1745(self): left = TimedeltaIndex(['1 day 15:19:49.695000']) - right = TimedeltaIndex( - ['2 day 13:04:21.322000', '1 day 15:27:24.873000', - '1 day 15:31:05.350000']) + right = TimedeltaIndex(['2 day 13:04:21.322000', + '1 day 15:27:24.873000', + '1 day 15:31:05.350000']) result = left.union(right) exp = TimedeltaIndex(sorted(set(list(left)) | set(list(right)))) - self.assertTrue(result.equals(exp)) + self.assert_index_equal(result, exp) def test_union_bug_4564(self): @@ -1504,7 +1504,7 @@ def test_union_bug_4564(self): result = left.union(right) exp = TimedeltaIndex(sorted(set(list(left)) | set(list(right)))) - self.assertTrue(result.equals(exp)) + self.assert_index_equal(result, exp) def test_intersection_bug_1708(self): index_1 = timedelta_range('1 day', periods=4, freq='h') @@ -1526,7 +1526,7 @@ def test_get_duplicates(self): result = idx.get_duplicates() ex = TimedeltaIndex(['2 day', '3day']) - self.assertTrue(result.equals(ex)) + self.assert_index_equal(result, ex) def test_argmin_argmax(self): idx = TimedeltaIndex(['1 day 00:00:05', '1 day 
00:00:01', @@ -1546,11 +1546,13 @@ def test_sort_values(self): ordered, dexer = idx.sort_values(return_indexer=True) self.assertTrue(ordered.is_monotonic) - self.assert_numpy_array_equal(dexer, [1, 2, 0]) + self.assert_numpy_array_equal(dexer, + np.array([1, 2, 0], dtype=np.int64)) ordered, dexer = idx.sort_values(return_indexer=True, ascending=False) self.assertTrue(ordered[::-1].is_monotonic) - self.assert_numpy_array_equal(dexer, [0, 2, 1]) + self.assert_numpy_array_equal(dexer, + np.array([0, 2, 1], dtype=np.int64)) def test_insert(self): @@ -1558,7 +1560,7 @@ def test_insert(self): result = idx.insert(2, timedelta(days=5)) exp = TimedeltaIndex(['4day', '1day', '5day', '2day'], name='idx') - self.assertTrue(result.equals(exp)) + self.assert_index_equal(result, exp) # insertion of non-datetime should coerce to object index result = idx.insert(1, 'inserted') @@ -1594,7 +1596,7 @@ def test_insert(self): for n, d, expected in cases: result = idx.insert(n, d) - self.assertTrue(result.equals(expected)) + self.assert_index_equal(result, expected) self.assertEqual(result.name, expected.name) self.assertEqual(result.freq, expected.freq) @@ -1618,7 +1620,7 @@ def test_delete(self): 1: expected_1} for n, expected in compat.iteritems(cases): result = idx.delete(n) - self.assertTrue(result.equals(expected)) + self.assert_index_equal(result, expected) self.assertEqual(result.name, expected.name) self.assertEqual(result.freq, expected.freq) @@ -1645,12 +1647,12 @@ def test_delete_slice(self): (3, 4, 5): expected_3_5} for n, expected in compat.iteritems(cases): result = idx.delete(n) - self.assertTrue(result.equals(expected)) + self.assert_index_equal(result, expected) self.assertEqual(result.name, expected.name) self.assertEqual(result.freq, expected.freq) result = idx.delete(slice(n[0], n[-1] + 1)) - self.assertTrue(result.equals(expected)) + self.assert_index_equal(result, expected) self.assertEqual(result.name, expected.name) self.assertEqual(result.freq, expected.freq) @@ -1664,7 +1666,7 @@ def test_take(self): taken2 = idx[[2, 4, 10]] for taken in [taken1, taken2]: - self.assertTrue(taken.equals(expected)) + self.assert_index_equal(taken, expected) tm.assertIsInstance(taken, TimedeltaIndex) self.assertIsNone(taken.freq) self.assertEqual(taken.name, expected.name) @@ -1711,7 +1713,7 @@ def test_isin(self): self.assertTrue(result.all()) assert_almost_equal(index.isin([index[2], 5]), - [False, False, True, False]) + np.array([False, False, True, False])) def test_does_not_convert_mixed_integer(self): df = tm.makeCustomDataframe(10, 10, @@ -1748,18 +1750,18 @@ def test_factorize(self): arr, idx = idx1.factorize() self.assert_numpy_array_equal(arr, exp_arr) - self.assertTrue(idx.equals(exp_idx)) + self.assert_index_equal(idx, exp_idx) arr, idx = idx1.factorize(sort=True) self.assert_numpy_array_equal(arr, exp_arr) - self.assertTrue(idx.equals(exp_idx)) + self.assert_index_equal(idx, exp_idx) # freq must be preserved idx3 = timedelta_range('1 day', periods=4, freq='s') exp_arr = np.array([0, 1, 2, 3]) arr, idx = idx3.factorize() self.assert_numpy_array_equal(arr, exp_arr) - self.assertTrue(idx.equals(idx3)) + self.assert_index_equal(idx, idx3) class TestSlicing(tm.TestCase): diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index 3a3315ed3890c..f6d80f7ee410b 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -59,7 +59,7 @@ def test_index_unique(self): expected = DatetimeIndex([datetime(2000, 1, 2), 
datetime(2000, 1, 3), datetime(2000, 1, 4), datetime(2000, 1, 5)]) self.assertEqual(uniques.dtype, 'M8[ns]') # sanity - self.assertTrue(uniques.equals(expected)) + tm.assert_index_equal(uniques, expected) self.assertEqual(self.dups.index.nunique(), 4) # #2563 @@ -68,22 +68,23 @@ def test_index_unique(self): dups_local = self.dups.index.tz_localize('US/Eastern') dups_local.name = 'foo' result = dups_local.unique() - expected = DatetimeIndex(expected).tz_localize('US/Eastern') + expected = DatetimeIndex(expected, name='foo') + expected = expected.tz_localize('US/Eastern') self.assertTrue(result.tz is not None) self.assertEqual(result.name, 'foo') - self.assertTrue(result.equals(expected)) + tm.assert_index_equal(result, expected) # NaT, note this is excluded arr = [1370745748 + t for t in range(20)] + [iNaT] idx = DatetimeIndex(arr * 3) - self.assertTrue(idx.unique().equals(DatetimeIndex(arr))) + tm.assert_index_equal(idx.unique(), DatetimeIndex(arr)) self.assertEqual(idx.nunique(), 20) self.assertEqual(idx.nunique(dropna=False), 21) arr = [Timestamp('2013-06-09 02:42:28') + timedelta(seconds=t) for t in range(20)] + [NaT] idx = DatetimeIndex(arr * 3) - self.assertTrue(idx.unique().equals(DatetimeIndex(arr))) + tm.assert_index_equal(idx.unique(), DatetimeIndex(arr)) self.assertEqual(idx.nunique(), 20) self.assertEqual(idx.nunique(dropna=False), 21) @@ -284,12 +285,12 @@ def test_recreate_from_data(self): for f in freqs: org = DatetimeIndex(start='2001/02/01 09:00', freq=f, periods=1) idx = DatetimeIndex(org, freq=f) - self.assertTrue(idx.equals(org)) + tm.assert_index_equal(idx, org) org = DatetimeIndex(start='2001/02/01 09:00', freq=f, tz='US/Pacific', periods=1) idx = DatetimeIndex(org, freq=f, tz='US/Pacific') - self.assertTrue(idx.equals(org)) + tm.assert_index_equal(idx, org) def assert_range_equal(left, right): @@ -874,7 +875,7 @@ def test_string_na_nat_conversion(self): result2 = to_datetime(strings) tm.assertIsInstance(result2, DatetimeIndex) - tm.assert_numpy_array_equal(result, result2) + tm.assert_numpy_array_equal(result, result2.values) malformed = np.array(['1/100/2000', np.nan], dtype=object) @@ -1065,7 +1066,7 @@ def test_to_datetime_list_of_integers(self): result = DatetimeIndex(ints) - self.assertTrue(rng.equals(result)) + tm.assert_index_equal(rng, result) def test_to_datetime_freq(self): xp = bdate_range('2000-1-1', periods=10, tz='UTC') @@ -1162,15 +1163,15 @@ def test_date_range_gen_error(self): def test_date_range_negative_freq(self): # GH 11018 rng = date_range('2011-12-31', freq='-2A', periods=3) - exp = pd.DatetimeIndex( - ['2011-12-31', '2009-12-31', '2007-12-31'], freq='-2A') - self.assert_index_equal(rng, exp) + exp = pd.DatetimeIndex(['2011-12-31', '2009-12-31', + '2007-12-31'], freq='-2A') + tm.assert_index_equal(rng, exp) self.assertEqual(rng.freq, '-2A') rng = date_range('2011-01-31', freq='-2M', periods=3) - exp = pd.DatetimeIndex( - ['2011-01-31', '2010-11-30', '2010-09-30'], freq='-2M') - self.assert_index_equal(rng, exp) + exp = pd.DatetimeIndex(['2011-01-31', '2010-11-30', + '2010-09-30'], freq='-2M') + tm.assert_index_equal(rng, exp) self.assertEqual(rng.freq, '-2M') def test_date_range_bms_bug(self): @@ -1523,7 +1524,7 @@ def test_normalize(self): result = rng.normalize() expected = date_range('1/1/2000', periods=10, freq='D') - self.assertTrue(result.equals(expected)) + tm.assert_index_equal(result, expected) rng_ns = pd.DatetimeIndex(np.array([1380585623454345752, 1380585612343234312]).astype( @@ -1532,7 +1533,7 @@ def test_normalize(self): expected 
= pd.DatetimeIndex(np.array([1380585600000000000, 1380585600000000000]).astype( "datetime64[ns]")) - self.assertTrue(rng_ns_normalized.equals(expected)) + tm.assert_index_equal(rng_ns_normalized, expected) self.assertTrue(result.is_normalized) self.assertFalse(rng.is_normalized) @@ -1549,7 +1550,7 @@ def test_to_period(self): pts = ts.to_period('M') exp.index = exp.index.asfreq('M') - self.assertTrue(pts.index.equals(exp.index.asfreq('M'))) + tm.assert_index_equal(pts.index, exp.index.asfreq('M')) assert_series_equal(pts, exp) # GH 7606 without freq @@ -1607,7 +1608,7 @@ def test_to_period_tz_pytz(self): expected = ts[0].to_period() self.assertEqual(result, expected) - self.assertTrue(ts.to_period().equals(xp)) + tm.assert_index_equal(ts.to_period(), xp) ts = date_range('1/1/2000', '4/1/2000', tz=UTC) @@ -1615,7 +1616,7 @@ def test_to_period_tz_pytz(self): expected = ts[0].to_period() self.assertEqual(result, expected) - self.assertTrue(ts.to_period().equals(xp)) + tm.assert_index_equal(ts.to_period(), xp) ts = date_range('1/1/2000', '4/1/2000', tz=tzlocal()) @@ -1623,7 +1624,7 @@ def test_to_period_tz_pytz(self): expected = ts[0].to_period() self.assertEqual(result, expected) - self.assertTrue(ts.to_period().equals(xp)) + tm.assert_index_equal(ts.to_period(), xp) def test_to_period_tz_explicit_pytz(self): tm._skip_if_no_pytz() @@ -1638,7 +1639,7 @@ def test_to_period_tz_explicit_pytz(self): expected = ts[0].to_period() self.assertTrue(result == expected) - self.assertTrue(ts.to_period().equals(xp)) + tm.assert_index_equal(ts.to_period(), xp) ts = date_range('1/1/2000', '4/1/2000', tz=pytz.utc) @@ -1646,7 +1647,7 @@ def test_to_period_tz_explicit_pytz(self): expected = ts[0].to_period() self.assertTrue(result == expected) - self.assertTrue(ts.to_period().equals(xp)) + tm.assert_index_equal(ts.to_period(), xp) ts = date_range('1/1/2000', '4/1/2000', tz=tzlocal()) @@ -1654,7 +1655,7 @@ def test_to_period_tz_explicit_pytz(self): expected = ts[0].to_period() self.assertTrue(result == expected) - self.assertTrue(ts.to_period().equals(xp)) + tm.assert_index_equal(ts.to_period(), xp) def test_to_period_tz_dateutil(self): tm._skip_if_no_dateutil() @@ -1669,7 +1670,7 @@ def test_to_period_tz_dateutil(self): expected = ts[0].to_period() self.assertTrue(result == expected) - self.assertTrue(ts.to_period().equals(xp)) + tm.assert_index_equal(ts.to_period(), xp) ts = date_range('1/1/2000', '4/1/2000', tz=dateutil.tz.tzutc()) @@ -1677,7 +1678,7 @@ def test_to_period_tz_dateutil(self): expected = ts[0].to_period() self.assertTrue(result == expected) - self.assertTrue(ts.to_period().equals(xp)) + tm.assert_index_equal(ts.to_period(), xp) ts = date_range('1/1/2000', '4/1/2000', tz=tzlocal()) @@ -1685,7 +1686,7 @@ def test_to_period_tz_dateutil(self): expected = ts[0].to_period() self.assertTrue(result == expected) - self.assertTrue(ts.to_period().equals(xp)) + tm.assert_index_equal(ts.to_period(), xp) def test_frame_to_period(self): K = 5 @@ -1702,7 +1703,7 @@ def test_frame_to_period(self): assert_frame_equal(pts, exp) pts = df.to_period('M') - self.assertTrue(pts.index.equals(exp.index.asfreq('M'))) + tm.assert_index_equal(pts.index, exp.index.asfreq('M')) df = df.T pts = df.to_period(axis=1) @@ -1711,7 +1712,7 @@ def test_frame_to_period(self): assert_frame_equal(pts, exp) pts = df.to_period('M', axis=1) - self.assertTrue(pts.columns.equals(exp.columns.asfreq('M'))) + tm.assert_index_equal(pts.columns, exp.columns.asfreq('M')) self.assertRaises(ValueError, df.to_period, axis=2) @@ -1799,11 +1800,11 @@ 
def test_datetimeindex_integers_shift(self): result = rng + 5 expected = rng.shift(5) - self.assertTrue(result.equals(expected)) + tm.assert_index_equal(result, expected) result = rng - 5 expected = rng.shift(-5) - self.assertTrue(result.equals(expected)) + tm.assert_index_equal(result, expected) def test_astype_object(self): # NumPy 1.6.1 weak ns support @@ -1812,7 +1813,8 @@ def test_astype_object(self): casted = rng.astype('O') exp_values = list(rng) - self.assert_numpy_array_equal(casted, exp_values) + tm.assert_index_equal(casted, Index(exp_values, dtype=np.object_)) + self.assertEqual(casted.tolist(), exp_values) def test_catch_infinite_loop(self): offset = datetools.DateOffset(minute=5) @@ -1828,15 +1830,15 @@ def test_append_concat(self): result = ts.append(ts) result_df = df.append(df) ex_index = DatetimeIndex(np.tile(rng.values, 2)) - self.assertTrue(result.index.equals(ex_index)) - self.assertTrue(result_df.index.equals(ex_index)) + tm.assert_index_equal(result.index, ex_index) + tm.assert_index_equal(result_df.index, ex_index) appended = rng.append(rng) - self.assertTrue(appended.equals(ex_index)) + tm.assert_index_equal(appended, ex_index) appended = rng.append([rng, rng]) ex_index = DatetimeIndex(np.tile(rng.values, 3)) - self.assertTrue(appended.equals(ex_index)) + tm.assert_index_equal(appended, ex_index) # different index names rng1 = rng.copy() @@ -1863,11 +1865,11 @@ def test_append_concat_tz(self): result = ts.append(ts2) result_df = df.append(df2) - self.assertTrue(result.index.equals(rng3)) - self.assertTrue(result_df.index.equals(rng3)) + tm.assert_index_equal(result.index, rng3) + tm.assert_index_equal(result_df.index, rng3) appended = rng.append(rng2) - self.assertTrue(appended.equals(rng3)) + tm.assert_index_equal(appended, rng3) def test_append_concat_tz_explicit_pytz(self): # GH 2938 @@ -1887,11 +1889,11 @@ def test_append_concat_tz_explicit_pytz(self): result = ts.append(ts2) result_df = df.append(df2) - self.assertTrue(result.index.equals(rng3)) - self.assertTrue(result_df.index.equals(rng3)) + tm.assert_index_equal(result.index, rng3) + tm.assert_index_equal(result_df.index, rng3) appended = rng.append(rng2) - self.assertTrue(appended.equals(rng3)) + tm.assert_index_equal(appended, rng3) def test_append_concat_tz_dateutil(self): # GH 2938 @@ -1909,11 +1911,11 @@ def test_append_concat_tz_dateutil(self): result = ts.append(ts2) result_df = df.append(df2) - self.assertTrue(result.index.equals(rng3)) - self.assertTrue(result_df.index.equals(rng3)) + tm.assert_index_equal(result.index, rng3) + tm.assert_index_equal(result_df.index, rng3) appended = rng.append(rng2) - self.assertTrue(appended.equals(rng3)) + tm.assert_index_equal(appended, rng3) def test_set_dataframe_column_ns_dtype(self): x = DataFrame([datetime.now(), datetime.now()]) @@ -2440,13 +2442,13 @@ def test_index_to_datetime(self): result = idx.to_datetime() expected = DatetimeIndex(datetools.to_datetime(idx.values)) - self.assertTrue(result.equals(expected)) + tm.assert_index_equal(result, expected) today = datetime.today() idx = Index([today], dtype=object) result = idx.to_datetime() expected = DatetimeIndex([today]) - self.assertTrue(result.equals(expected)) + tm.assert_index_equal(result, expected) def test_dataframe(self): @@ -2596,14 +2598,14 @@ def test_to_period_nofreq(self): idx = DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03'], freq='infer') self.assertEqual(idx.freqstr, 'D') - expected = pd.PeriodIndex( - ['2000-01-01', '2000-01-02', '2000-01-03'], freq='D') - 
self.assertTrue(idx.to_period().equals(expected)) + expected = pd.PeriodIndex(['2000-01-01', '2000-01-02', + '2000-01-03'], freq='D') + tm.assert_index_equal(idx.to_period(), expected) # GH 7606 idx = DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03']) self.assertEqual(idx.freqstr, None) - self.assertTrue(idx.to_period().equals(expected)) + tm.assert_index_equal(idx.to_period(), expected) def test_000constructor_resolution(self): # 2252 @@ -2615,7 +2617,7 @@ def test_000constructor_resolution(self): def test_constructor_coverage(self): rng = date_range('1/1/2000', periods=10.5) exp = date_range('1/1/2000', periods=10) - self.assertTrue(rng.equals(exp)) + tm.assert_index_equal(rng, exp) self.assertRaises(ValueError, DatetimeIndex, start='1/1/2000', periods='foo', freq='D') @@ -2630,25 +2632,25 @@ def test_constructor_coverage(self): result = DatetimeIndex(gen) expected = DatetimeIndex([datetime(2000, 1, 1) + timedelta(i) for i in range(10)]) - self.assertTrue(result.equals(expected)) + tm.assert_index_equal(result, expected) # NumPy string array strings = np.array(['2000-01-01', '2000-01-02', '2000-01-03']) result = DatetimeIndex(strings) expected = DatetimeIndex(strings.astype('O')) - self.assertTrue(result.equals(expected)) + tm.assert_index_equal(result, expected) from_ints = DatetimeIndex(expected.asi8) - self.assertTrue(from_ints.equals(expected)) + tm.assert_index_equal(from_ints, expected) # string with NaT strings = np.array(['2000-01-01', '2000-01-02', 'NaT']) result = DatetimeIndex(strings) expected = DatetimeIndex(strings.astype('O')) - self.assertTrue(result.equals(expected)) + tm.assert_index_equal(result, expected) from_ints = DatetimeIndex(expected.asi8) - self.assertTrue(from_ints.equals(expected)) + tm.assert_index_equal(from_ints, expected) # non-conforming self.assertRaises(ValueError, DatetimeIndex, @@ -2715,17 +2717,15 @@ def test_constructor_datetime64_tzformat(self): def test_constructor_dtype(self): # passing a dtype with a tz should localize - idx = DatetimeIndex(['2013-01-01', - '2013-01-02'], + idx = DatetimeIndex(['2013-01-01', '2013-01-02'], dtype='datetime64[ns, US/Eastern]') expected = DatetimeIndex(['2013-01-01', '2013-01-02'] ).tz_localize('US/Eastern') - self.assertTrue(idx.equals(expected)) + tm.assert_index_equal(idx, expected) - idx = DatetimeIndex(['2013-01-01', - '2013-01-02'], + idx = DatetimeIndex(['2013-01-01', '2013-01-02'], tz='US/Eastern') - self.assertTrue(idx.equals(expected)) + tm.assert_index_equal(idx, expected) # if we already have a tz and its not the same, then raise idx = DatetimeIndex(['2013-01-01', '2013-01-02'], @@ -2744,7 +2744,7 @@ def test_constructor_dtype(self): idx, tz='CET', dtype='datetime64[ns, US/Eastern]')) result = DatetimeIndex(idx, dtype='datetime64[ns, US/Eastern]') - self.assertTrue(idx.equals(result)) + tm.assert_index_equal(idx, result) def test_constructor_name(self): idx = DatetimeIndex(start='2000-01-01', periods=1, freq='A', @@ -2860,7 +2860,7 @@ def test_map(self): f = lambda x: x.strftime('%Y%m%d') result = rng.map(f) - exp = [f(x) for x in rng] + exp = np.array([f(x) for x in rng], dtype=' val - expected = np.array([x > val for x in self.series]) - self.assert_numpy_array_equal(result, expected) + expected = Series([x > val for x in self.series]) + self.assert_series_equal(result, expected) val = self.series[5] result = self.series > val - expected = np.array([x > val for x in self.series]) - self.assert_numpy_array_equal(result, expected) + expected = Series([x > val for x in self.series]) + 
self.assert_series_equal(result, expected) def test_between(self): left, right = self.series[[2, 7]] @@ -4775,10 +4777,9 @@ def test_date_range_normalize(self): rng = date_range(snap, periods=n, normalize=False, freq='2D') offset = timedelta(2) - values = np.array([snap + i * offset for i in range(n)], - dtype='M8[ns]') + values = DatetimeIndex([snap + i * offset for i in range(n)]) - self.assert_numpy_array_equal(rng, values) + tm.assert_index_equal(rng, values) rng = date_range('1/1/2000 08:15', periods=n, normalize=False, freq='B') @@ -4797,7 +4798,7 @@ def test_timedelta(self): result = index - timedelta(1) expected = index + timedelta(-1) - self.assertTrue(result.equals(expected)) + tm.assert_index_equal(result, expected) # GH4134, buggy with timedeltas rng = date_range('2013', '2014') @@ -4806,8 +4807,8 @@ def test_timedelta(self): result2 = DatetimeIndex(s - np.timedelta64(100000000)) result3 = rng - np.timedelta64(100000000) result4 = DatetimeIndex(s - pd.offsets.Hour(1)) - self.assertTrue(result1.equals(result4)) - self.assertTrue(result2.equals(result3)) + tm.assert_index_equal(result1, result4) + tm.assert_index_equal(result2, result3) def test_shift(self): ts = Series(np.random.randn(5), @@ -4815,12 +4816,12 @@ def test_shift(self): result = ts.shift(1, freq='5T') exp_index = ts.index.shift(1, freq='5T') - self.assertTrue(result.index.equals(exp_index)) + tm.assert_index_equal(result.index, exp_index) # GH #1063, multiple of same base result = ts.shift(1, freq='4H') exp_index = ts.index + datetools.Hour(4) - self.assertTrue(result.index.equals(exp_index)) + tm.assert_index_equal(result.index, exp_index) idx = DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-04']) self.assertRaises(ValueError, idx.shift, 1) @@ -4972,7 +4973,7 @@ def test_to_datetime_format(self): elif isinstance(expected, Timestamp): self.assertEqual(result, expected) else: - self.assertTrue(result.equals(expected)) + tm.assert_index_equal(result, expected) def test_to_datetime_format_YYYYMMDD(self): s = Series([19801222, 19801222] + [19810105] * 5) @@ -5003,9 +5004,10 @@ def test_to_datetime_format_YYYYMMDD(self): # GH 7930 s = Series([20121231, 20141231, 99991231]) result = pd.to_datetime(s, format='%Y%m%d', errors='ignore') - expected = np.array([datetime(2012, 12, 31), datetime( - 2014, 12, 31), datetime(9999, 12, 31)], dtype=object) - self.assert_numpy_array_equal(result, expected) + expected = Series([datetime(2012, 12, 31), + datetime(2014, 12, 31), datetime(9999, 12, 31)], + dtype=object) + self.assert_series_equal(result, expected) result = pd.to_datetime(s, format='%Y%m%d', errors='coerce') expected = Series(['20121231', '20141231', 'NaT'], dtype='M8[ns]') @@ -5092,18 +5094,13 @@ def test_to_datetime_format_weeks(self): class TestToDatetimeInferFormat(tm.TestCase): def test_to_datetime_infer_datetime_format_consistent_format(self): - time_series = pd.Series(pd.date_range('20000101', periods=50, - freq='H')) + s = pd.Series(pd.date_range('20000101', periods=50, freq='H')) - test_formats = [ - '%m-%d-%Y', - '%m/%d/%Y %H:%M:%S.%f', - '%Y-%m-%dT%H:%M:%S.%f', - ] + test_formats = ['%m-%d-%Y', '%m/%d/%Y %H:%M:%S.%f', + '%Y-%m-%dT%H:%M:%S.%f'] for test_format in test_formats: - s_as_dt_strings = time_series.apply( - lambda x: x.strftime(test_format)) + s_as_dt_strings = s.apply(lambda x: x.strftime(test_format)) with_format = pd.to_datetime(s_as_dt_strings, format=test_format) no_infer = pd.to_datetime(s_as_dt_strings, @@ -5113,70 +5110,45 @@ def 
test_to_datetime_infer_datetime_format_consistent_format(self): # Whether the format is explicitly passed, it is inferred, or # it is not inferred, the results should all be the same - self.assert_numpy_array_equal(with_format, no_infer) - self.assert_numpy_array_equal(no_infer, yes_infer) + self.assert_series_equal(with_format, no_infer) + self.assert_series_equal(no_infer, yes_infer) def test_to_datetime_infer_datetime_format_inconsistent_format(self): - test_series = pd.Series(np.array([ - '01/01/2011 00:00:00', - '01-02-2011 00:00:00', - '2011-01-03T00:00:00', - ])) + s = pd.Series(np.array(['01/01/2011 00:00:00', + '01-02-2011 00:00:00', + '2011-01-03T00:00:00'])) # When the format is inconsistent, infer_datetime_format should just # fallback to the default parsing - self.assert_numpy_array_equal( - pd.to_datetime(test_series, infer_datetime_format=False), - pd.to_datetime(test_series, infer_datetime_format=True) - ) + tm.assert_series_equal(pd.to_datetime(s, infer_datetime_format=False), + pd.to_datetime(s, infer_datetime_format=True)) - test_series = pd.Series(np.array([ - 'Jan/01/2011', - 'Feb/01/2011', - 'Mar/01/2011', - ])) + s = pd.Series(np.array(['Jan/01/2011', 'Feb/01/2011', 'Mar/01/2011'])) - self.assert_numpy_array_equal( - pd.to_datetime(test_series, infer_datetime_format=False), - pd.to_datetime(test_series, infer_datetime_format=True) - ) + tm.assert_series_equal(pd.to_datetime(s, infer_datetime_format=False), + pd.to_datetime(s, infer_datetime_format=True)) def test_to_datetime_infer_datetime_format_series_with_nans(self): - test_series = pd.Series(np.array([ - '01/01/2011 00:00:00', - np.nan, - '01/03/2011 00:00:00', - np.nan, - ])) - - self.assert_numpy_array_equal( - pd.to_datetime(test_series, infer_datetime_format=False), - pd.to_datetime(test_series, infer_datetime_format=True) - ) + s = pd.Series(np.array(['01/01/2011 00:00:00', np.nan, + '01/03/2011 00:00:00', np.nan])) + tm.assert_series_equal(pd.to_datetime(s, infer_datetime_format=False), + pd.to_datetime(s, infer_datetime_format=True)) def test_to_datetime_infer_datetime_format_series_starting_with_nans(self): - test_series = pd.Series(np.array([ - np.nan, - np.nan, - '01/01/2011 00:00:00', - '01/02/2011 00:00:00', - '01/03/2011 00:00:00', - ])) + s = pd.Series(np.array([np.nan, np.nan, '01/01/2011 00:00:00', + '01/02/2011 00:00:00', '01/03/2011 00:00:00'])) - self.assert_numpy_array_equal( - pd.to_datetime(test_series, infer_datetime_format=False), - pd.to_datetime(test_series, infer_datetime_format=True) - ) + tm.assert_series_equal(pd.to_datetime(s, infer_datetime_format=False), + pd.to_datetime(s, infer_datetime_format=True)) def test_to_datetime_iso8601_noleading_0s(self): # GH 11871 - test_series = pd.Series(['2014-1-1', '2014-2-2', '2015-3-3']) + s = pd.Series(['2014-1-1', '2014-2-2', '2015-3-3']) expected = pd.Series([pd.Timestamp('2014-01-01'), pd.Timestamp('2014-02-02'), pd.Timestamp('2015-03-03')]) - tm.assert_series_equal(pd.to_datetime(test_series), expected) - tm.assert_series_equal(pd.to_datetime(test_series, format='%Y-%m-%d'), - expected) + tm.assert_series_equal(pd.to_datetime(s), expected) + tm.assert_series_equal(pd.to_datetime(s, format='%Y-%m-%d'), expected) class TestGuessDatetimeFormat(tm.TestCase): diff --git a/pandas/tseries/tests/test_timeseries_legacy.py b/pandas/tseries/tests/test_timeseries_legacy.py index 086f23cd2d4fd..6f58ad3a57b48 100644 --- a/pandas/tseries/tests/test_timeseries_legacy.py +++ b/pandas/tseries/tests/test_timeseries_legacy.py @@ -85,7 +85,7 @@ def 
test_unpickle_legacy_len0_daterange(self): ex_index = DatetimeIndex([], freq='B') - self.assertTrue(result.index.equals(ex_index)) + self.assert_index_equal(result.index, ex_index) tm.assertIsInstance(result.index.freq, offsets.BDay) self.assertEqual(len(result), 0) @@ -116,7 +116,7 @@ def _check_join(left, right, how='inner'): return_indexers=True) tm.assertIsInstance(ra, DatetimeIndex) - self.assertTrue(ra.equals(ea)) + self.assert_index_equal(ra, ea) assert_almost_equal(rb, eb) assert_almost_equal(rc, ec) @@ -150,24 +150,24 @@ def test_setops(self): result = index[:5].union(obj_index[5:]) expected = index tm.assertIsInstance(result, DatetimeIndex) - self.assertTrue(result.equals(expected)) + self.assert_index_equal(result, expected) result = index[:10].intersection(obj_index[5:]) expected = index[5:10] tm.assertIsInstance(result, DatetimeIndex) - self.assertTrue(result.equals(expected)) + self.assert_index_equal(result, expected) result = index[:10] - obj_index[5:] expected = index[:5] tm.assertIsInstance(result, DatetimeIndex) - self.assertTrue(result.equals(expected)) + self.assert_index_equal(result, expected) def test_index_conversion(self): index = self.frame.index obj_index = index.asobject conv = DatetimeIndex(obj_index) - self.assertTrue(conv.equals(index)) + self.assert_index_equal(conv, index) self.assertRaises(ValueError, DatetimeIndex, ['a', 'b', 'c', 'd']) @@ -188,11 +188,11 @@ def test_setops_conversion_fail(self): result = index.union(right) expected = Index(np.concatenate([index.asobject, right])) - self.assertTrue(result.equals(expected)) + self.assert_index_equal(result, expected) result = index.intersection(right) expected = Index([]) - self.assertTrue(result.equals(expected)) + self.assert_index_equal(result, expected) def test_legacy_time_rules(self): rules = [('WEEKDAY', 'B'), ('EOM', 'BM'), ('W@MON', 'W-MON'), @@ -211,7 +211,7 @@ def test_legacy_time_rules(self): for old_freq, new_freq in rules: old_rng = date_range(start, end, freq=old_freq) new_rng = date_range(start, end, freq=new_freq) - self.assertTrue(old_rng.equals(new_rng)) + self.assert_index_equal(old_rng, new_rng) # test get_legacy_offset_name offset = datetools.get_offset(new_freq) diff --git a/pandas/tseries/tests/test_timezones.py b/pandas/tseries/tests/test_timezones.py index 1f0632377c851..b80ee4c5c1e39 100644 --- a/pandas/tseries/tests/test_timezones.py +++ b/pandas/tseries/tests/test_timezones.py @@ -263,7 +263,7 @@ def test_create_with_fixed_tz(self): self.assertEqual(off, rng.tz) rng2 = date_range(start, periods=len(rng), tz=off) - self.assertTrue(rng.equals(rng2)) + self.assert_index_equal(rng, rng2) rng3 = date_range('3/11/2012 05:00:00+07:00', '6/11/2012 05:00:00+07:00') @@ -287,7 +287,7 @@ def test_date_range_localize(self): rng3 = date_range('3/11/2012 03:00', periods=15, freq='H') rng3 = rng3.tz_localize('US/Eastern') - self.assertTrue(rng.equals(rng3)) + self.assert_index_equal(rng, rng3) # DST transition time val = rng[0] @@ -296,14 +296,14 @@ def test_date_range_localize(self): self.assertEqual(val.hour, 3) self.assertEqual(exp.hour, 3) self.assertEqual(val, exp) # same UTC value - self.assertTrue(rng[:2].equals(rng2)) + self.assert_index_equal(rng[:2], rng2) # Right before the DST transition rng = date_range('3/11/2012 00:00', periods=2, freq='H', tz='US/Eastern') rng2 = DatetimeIndex(['3/11/2012 00:00', '3/11/2012 01:00'], tz='US/Eastern') - self.assertTrue(rng.equals(rng2)) + self.assert_index_equal(rng, rng2) exp = Timestamp('3/11/2012 00:00', tz='US/Eastern') 
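        # 2012-03-11 00:00 local time falls before the 02:00 spring-forward
        # gap, so localizing preserves the midnight hour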
self.assertEqual(exp.hour, 0) self.assertEqual(rng[0], exp) @@ -402,7 +402,7 @@ def test_tz_localize(self): dr = bdate_range('1/1/2009', '1/1/2010') dr_utc = bdate_range('1/1/2009', '1/1/2010', tz=pytz.utc) localized = dr.tz_localize(pytz.utc) - self.assert_numpy_array_equal(dr_utc, localized) + self.assert_index_equal(dr_utc, localized) def test_with_tz_ambiguous_times(self): tz = self.tz('US/Eastern') @@ -440,22 +440,22 @@ def test_ambiguous_infer(self): '11/06/2011 02:00', '11/06/2011 03:00'] di = DatetimeIndex(times) localized = di.tz_localize(tz, ambiguous='infer') - self.assert_numpy_array_equal(dr, localized) + self.assert_index_equal(dr, localized) with tm.assert_produces_warning(FutureWarning): localized_old = di.tz_localize(tz, infer_dst=True) - self.assert_numpy_array_equal(dr, localized_old) - self.assert_numpy_array_equal(dr, DatetimeIndex(times, tz=tz, - ambiguous='infer')) + self.assert_index_equal(dr, localized_old) + self.assert_index_equal(dr, DatetimeIndex(times, tz=tz, + ambiguous='infer')) # When there is no dst transition, nothing special happens dr = date_range(datetime(2011, 6, 1, 0), periods=10, freq=datetools.Hour()) localized = dr.tz_localize(tz) localized_infer = dr.tz_localize(tz, ambiguous='infer') - self.assert_numpy_array_equal(localized, localized_infer) + self.assert_index_equal(localized, localized_infer) with tm.assert_produces_warning(FutureWarning): localized_infer_old = dr.tz_localize(tz, infer_dst=True) - self.assert_numpy_array_equal(localized, localized_infer_old) + self.assert_index_equal(localized, localized_infer_old) def test_ambiguous_flags(self): # November 6, 2011, fall back, repeat 2 AM hour @@ -471,20 +471,20 @@ def test_ambiguous_flags(self): di = DatetimeIndex(times) is_dst = [1, 1, 0, 0, 0] localized = di.tz_localize(tz, ambiguous=is_dst) - self.assert_numpy_array_equal(dr, localized) - self.assert_numpy_array_equal(dr, DatetimeIndex(times, tz=tz, - ambiguous=is_dst)) + self.assert_index_equal(dr, localized) + self.assert_index_equal(dr, DatetimeIndex(times, tz=tz, + ambiguous=is_dst)) localized = di.tz_localize(tz, ambiguous=np.array(is_dst)) - self.assert_numpy_array_equal(dr, localized) + self.assert_index_equal(dr, localized) localized = di.tz_localize(tz, ambiguous=np.array(is_dst).astype('bool')) - self.assert_numpy_array_equal(dr, localized) + self.assert_index_equal(dr, localized) # Test constructor localized = DatetimeIndex(times, tz=tz, ambiguous=is_dst) - self.assert_numpy_array_equal(dr, localized) + self.assert_index_equal(dr, localized) # Test duplicate times where infer_dst fails times += times @@ -497,7 +497,7 @@ def test_ambiguous_flags(self): is_dst = np.hstack((is_dst, is_dst)) localized = di.tz_localize(tz, ambiguous=is_dst) dr = dr.append(dr) - self.assert_numpy_array_equal(dr, localized) + self.assert_index_equal(dr, localized) # When there is no dst transition, nothing special happens dr = date_range(datetime(2011, 6, 1, 0), periods=10, @@ -505,7 +505,7 @@ def test_ambiguous_flags(self): is_dst = np.array([1] * 10) localized = dr.tz_localize(tz) localized_is_dst = dr.tz_localize(tz, ambiguous=is_dst) - self.assert_numpy_array_equal(localized, localized_is_dst) + self.assert_index_equal(localized, localized_is_dst) # construction with an ambiguous end-point # GH 11626 @@ -531,7 +531,10 @@ def test_ambiguous_nat(self): times = ['11/06/2011 00:00', np.NaN, np.NaN, '11/06/2011 02:00', '11/06/2011 03:00'] di_test = DatetimeIndex(times, tz='US/Eastern') - self.assert_numpy_array_equal(di_test, localized) + + # left 
dtype is datetime64[ns, US/Eastern] + # right is datetime64[ns, tzfile('/usr/share/zoneinfo/US/Eastern')] + self.assert_numpy_array_equal(di_test.values, localized.values) def test_nonexistent_raise_coerce(self): # See issue 13057 @@ -580,7 +583,7 @@ def test_tz_string(self): tz=self.tzstr('US/Eastern')) expected = date_range('1/1/2000', periods=10, tz=self.tz('US/Eastern')) - self.assertTrue(result.equals(expected)) + self.assert_index_equal(result, expected) def test_take_dont_lose_meta(self): tm._skip_if_no_pytz() @@ -673,7 +676,7 @@ def test_convert_tz_aware_datetime_datetime(self): self.assertTrue(self.cmptz(result.tz, self.tz('US/Eastern'))) converted = to_datetime(dates_aware, utc=True) - ex_vals = [Timestamp(x).value for x in dates_aware] + ex_vals = np.array([Timestamp(x).value for x in dates_aware]) self.assert_numpy_array_equal(converted.asi8, ex_vals) self.assertIs(converted.tz, pytz.utc) @@ -779,10 +782,11 @@ def test_date_range_span_dst_transition(self): self.assertTrue((dr.hour == 0).all()) def test_convert_datetime_list(self): - dr = date_range('2012-06-02', periods=10, tz=self.tzstr('US/Eastern')) + dr = date_range('2012-06-02', periods=10, + tz=self.tzstr('US/Eastern'), name='foo') dr2 = DatetimeIndex(list(dr), name='foo') - self.assertTrue(dr.equals(dr2)) + self.assert_index_equal(dr, dr2) self.assertEqual(dr.tz, dr2.tz) self.assertEqual(dr2.name, 'foo') @@ -845,7 +849,7 @@ def test_datetimeindex_tz(self): idx4 = DatetimeIndex(np.array(arr), tz=self.tzstr('US/Eastern')) for other in [idx2, idx3, idx4]: - self.assertTrue(idx1.equals(other)) + self.assert_index_equal(idx1, other) def test_datetimeindex_tz_nat(self): idx = to_datetime([Timestamp("2013-1-1", tz=self.tzstr('US/Eastern')), @@ -1011,7 +1015,7 @@ def test_tz_localize_naive(self): conv = rng.tz_localize('US/Pacific') exp = date_range('1/1/2011', periods=100, freq='H', tz='US/Pacific') - self.assertTrue(conv.equals(exp)) + self.assert_index_equal(conv, exp) def test_tz_localize_roundtrip(self): for tz in self.timezones: @@ -1143,7 +1147,7 @@ def test_join_aware(self): result = test1.join(test2, how='outer') ex_index = test1.index.union(test2.index) - self.assertTrue(result.index.equals(ex_index)) + self.assert_index_equal(result.index, ex_index) self.assertTrue(result.index.tz.zone == 'US/Central') # non-overlapping @@ -1199,11 +1203,11 @@ def test_append_aware_naive(self): ts1 = Series(np.random.randn(len(rng1)), index=rng1) ts2 = Series(np.random.randn(len(rng2)), index=rng2) ts_result = ts1.append(ts2) + self.assertTrue(ts_result.index.equals(ts1.index.asobject.append( ts2.index.asobject))) # mixed - rng1 = date_range('1/1/2011 01:00', periods=1, freq='H') rng2 = lrange(100) ts1 = Series(np.random.randn(len(rng1)), index=rng1) @@ -1280,7 +1284,7 @@ def test_datetimeindex_tz(self): rng = date_range('03/12/2012 00:00', periods=10, freq='W-FRI', tz='US/Eastern') rng2 = DatetimeIndex(data=rng, tz='US/Eastern') - self.assertTrue(rng.equals(rng2)) + self.assert_index_equal(rng, rng2) def test_normalize_tz(self): rng = date_range('1/1/2000 9:30', periods=10, freq='D', @@ -1289,7 +1293,7 @@ def test_normalize_tz(self): result = rng.normalize() expected = date_range('1/1/2000', periods=10, freq='D', tz='US/Eastern') - self.assertTrue(result.equals(expected)) + self.assert_index_equal(result, expected) self.assertTrue(result.is_normalized) self.assertFalse(rng.is_normalized) @@ -1298,7 +1302,7 @@ def test_normalize_tz(self): result = rng.normalize() expected = date_range('1/1/2000', periods=10, freq='D', tz='UTC') - 
self.assertTrue(result.equals(expected)) + self.assert_index_equal(result, expected) self.assertTrue(result.is_normalized) self.assertFalse(rng.is_normalized) @@ -1307,7 +1311,7 @@ def test_normalize_tz(self): rng = date_range('1/1/2000 9:30', periods=10, freq='D', tz=tzlocal()) result = rng.normalize() expected = date_range('1/1/2000', periods=10, freq='D', tz=tzlocal()) - self.assertTrue(result.equals(expected)) + self.assert_index_equal(result, expected) self.assertTrue(result.is_normalized) self.assertFalse(rng.is_normalized) @@ -1324,45 +1328,45 @@ def test_tzaware_offset(self): '2010-11-01 07:00'], freq='H', tz=tz) offset = dates + offsets.Hour(5) - self.assertTrue(offset.equals(expected)) + self.assert_index_equal(offset, expected) offset = dates + np.timedelta64(5, 'h') - self.assertTrue(offset.equals(expected)) + self.assert_index_equal(offset, expected) offset = dates + timedelta(hours=5) - self.assertTrue(offset.equals(expected)) + self.assert_index_equal(offset, expected) def test_nat(self): # GH 5546 dates = [NaT] idx = DatetimeIndex(dates) idx = idx.tz_localize('US/Pacific') - self.assertTrue(idx.equals(DatetimeIndex(dates, tz='US/Pacific'))) + self.assert_index_equal(idx, DatetimeIndex(dates, tz='US/Pacific')) idx = idx.tz_convert('US/Eastern') - self.assertTrue(idx.equals(DatetimeIndex(dates, tz='US/Eastern'))) + self.assert_index_equal(idx, DatetimeIndex(dates, tz='US/Eastern')) idx = idx.tz_convert('UTC') - self.assertTrue(idx.equals(DatetimeIndex(dates, tz='UTC'))) + self.assert_index_equal(idx, DatetimeIndex(dates, tz='UTC')) dates = ['2010-12-01 00:00', '2010-12-02 00:00', NaT] idx = DatetimeIndex(dates) idx = idx.tz_localize('US/Pacific') - self.assertTrue(idx.equals(DatetimeIndex(dates, tz='US/Pacific'))) + self.assert_index_equal(idx, DatetimeIndex(dates, tz='US/Pacific')) idx = idx.tz_convert('US/Eastern') expected = ['2010-12-01 03:00', '2010-12-02 03:00', NaT] - self.assertTrue(idx.equals(DatetimeIndex(expected, tz='US/Eastern'))) + self.assert_index_equal(idx, DatetimeIndex(expected, tz='US/Eastern')) idx = idx + offsets.Hour(5) expected = ['2010-12-01 08:00', '2010-12-02 08:00', NaT] - self.assertTrue(idx.equals(DatetimeIndex(expected, tz='US/Eastern'))) + self.assert_index_equal(idx, DatetimeIndex(expected, tz='US/Eastern')) idx = idx.tz_convert('US/Pacific') expected = ['2010-12-01 05:00', '2010-12-02 05:00', NaT] - self.assertTrue(idx.equals(DatetimeIndex(expected, tz='US/Pacific'))) + self.assert_index_equal(idx, DatetimeIndex(expected, tz='US/Pacific')) idx = idx + np.timedelta64(3, 'h') expected = ['2010-12-01 08:00', '2010-12-02 08:00', NaT] - self.assertTrue(idx.equals(DatetimeIndex(expected, tz='US/Pacific'))) + self.assert_index_equal(idx, DatetimeIndex(expected, tz='US/Pacific')) idx = idx.tz_convert('US/Eastern') expected = ['2010-12-01 11:00', '2010-12-02 11:00', NaT] - self.assertTrue(idx.equals(DatetimeIndex(expected, tz='US/Eastern'))) + self.assert_index_equal(idx, DatetimeIndex(expected, tz='US/Eastern')) if __name__ == '__main__': diff --git a/pandas/tseries/tests/test_tslib.py b/pandas/tseries/tests/test_tslib.py index 8414a5ed42991..d7426daa794c3 100644 --- a/pandas/tseries/tests/test_tslib.py +++ b/pandas/tseries/tests/test_tslib.py @@ -812,8 +812,9 @@ def test_parsers_time(self): self.assert_series_equal(tools.to_time(Series(arg, name="test")), Series(expected_arr, name="test")) - self.assert_numpy_array_equal(tools.to_time(np.array(arg)), - np.array(expected_arr, dtype=np.object_)) + res = tools.to_time(np.array(arg)) + 
self.assertIsInstance(res, list)
+        self.assert_equal(res, expected_arr)

     def test_parsers_monthfreq(self):
         cases = {'201101': datetime.datetime(2011, 1, 1, 0, 0),
diff --git a/pandas/util/testing.py b/pandas/util/testing.py
index e39dc441bcca4..f2b5bf7d2739d 100644
--- a/pandas/util/testing.py
+++ b/pandas/util/testing.py
@@ -31,7 +31,6 @@
 from pandas.core.algorithms import take_1d

 import pandas.compat as compat
-import pandas.lib as lib
 from pandas.compat import(
     filter, map, zip, range, unichr, lrange, lmap, lzip, u, callable, Counter,
     raise_with_traceback, httplib, is_platform_windows, is_platform_32bit,
@@ -116,25 +115,39 @@ def assertNotAlmostEquals(self, *args, **kwargs):
                      self.assertNotAlmostEqual)(*args, **kwargs)


-def assert_almost_equal(left, right, check_exact=False, **kwargs):
+def assert_almost_equal(left, right, check_exact=False,
+                        check_dtype='equiv', **kwargs):
     if isinstance(left, pd.Index):
         return assert_index_equal(left, right, check_exact=check_exact,
-                                  **kwargs)
+                                  exact=check_dtype, **kwargs)

     elif isinstance(left, pd.Series):
         return assert_series_equal(left, right, check_exact=check_exact,
-                                   **kwargs)
+                                   check_dtype=check_dtype, **kwargs)

     elif isinstance(left, pd.DataFrame):
         return assert_frame_equal(left, right, check_exact=check_exact,
-                                  **kwargs)
+                                  check_dtype=check_dtype, **kwargs)

-    return _testing.assert_almost_equal(left, right, **kwargs)
+    else:
+        # other sequences
+        if check_dtype:
+            if is_number(left) and is_number(right):
+                # do not compare numeric classes, like np.float64 and float
+                pass
+            else:
+                if (isinstance(left, np.ndarray) or
+                        isinstance(right, np.ndarray)):
+                    obj = 'numpy array'
+                else:
+                    obj = 'Input'
+                assert_class_equal(left, right, obj=obj)
+        return _testing.assert_almost_equal(left, right,
+                                            check_dtype=check_dtype, **kwargs)


 def assert_dict_equal(left, right, compare_keys=True):
-
     # instance validation
     assertIsInstance(left, dict, '[dict] ')
     assertIsInstance(right, dict, '[dict] ')
@@ -966,33 +979,29 @@ def assert_numpy_array_equal(left, right, strict_nan=False,
         assertion message
     """

+    # instance validation
+    # to show a detailed error message when classes are different
+    assert_class_equal(left, right, obj=obj)
+    # both classes must be an np.ndarray
+    assertIsInstance(left, np.ndarray, '[ndarray] ')
+    assertIsInstance(right, np.ndarray, '[ndarray] ')
+
     def _raise(left, right, err_msg):
         if err_msg is None:
-            # show detailed error
-            if lib.isscalar(left) and lib.isscalar(right):
-                # show scalar comparison error
-                assert_equal(left, right)
-            elif is_list_like(left) and is_list_like(right):
-                # some test cases pass list
-                left = np.asarray(left)
-                right = np.array(right)
-
-                if left.shape != right.shape:
-                    raise_assert_detail(obj, '{0} shapes are different'
-                                        .format(obj), left.shape, right.shape)
-
-                diff = 0
-                for l, r in zip(left, right):
-                    # count up differences
-                    if not array_equivalent(l, r, strict_nan=strict_nan):
-                        diff += 1
-
-                diff = diff * 100.0 / left.size
-                msg = '{0} values are different ({1} %)'\
-                    .format(obj, np.round(diff, 5))
-                raise_assert_detail(obj, msg, left, right)
-            else:
-                assert_class_equal(left, right, obj=obj)
+            if left.shape != right.shape:
+                raise_assert_detail(obj, '{0} shapes are different'
+                                    .format(obj), left.shape, right.shape)
+
+            diff = 0
+            for l, r in zip(left, right):
+                # count up differences
+                if not array_equivalent(l, r, strict_nan=strict_nan):
+                    diff += 1
+
+            diff = diff * 100.0 / left.size
+            msg = '{0} values are different ({1} %)'\
+                .format(obj, np.round(diff, 5))
+            raise_assert_detail(obj, msg, left,
right) raise AssertionError(err_msg) @@ -1076,8 +1085,8 @@ def assert_series_equal(left, right, check_dtype=True, if check_exact: assert_numpy_array_equal(left.get_values(), right.get_values(), - obj='{0}'.format(obj), - check_dtype=check_dtype) + check_dtype=check_dtype, + obj='{0}'.format(obj),) elif check_datetimelike_compat: # we want to check only if we have compat dtypes # e.g. integer and M|m are NOT compat, but we can simply check @@ -1093,7 +1102,7 @@ def assert_series_equal(left, right, check_dtype=True, msg = '[datetimelike_compat=True] {0} is not equal to {1}.' raise AssertionError(msg.format(left.values, right.values)) else: - assert_numpy_array_equal(left.values, right.values, + assert_numpy_array_equal(left.get_values(), right.get_values(), check_dtype=check_dtype) else: _testing.assert_almost_equal(left.get_values(), right.get_values(), @@ -1314,11 +1323,7 @@ def assert_sp_array_equal(left, right): raise_assert_detail('SparseArray.index', 'index are not equal', left.sp_index, right.sp_index) - if np.isnan(left.fill_value): - assert (np.isnan(right.fill_value)) - else: - assert (left.fill_value == right.fill_value) - + assert_attr_equal('fill_value', left, right) assert_attr_equal('dtype', left, right) assert_numpy_array_equal(left.values, right.values) From af4ed0f645685d322bc5b6ece575628f0985ad72 Mon Sep 17 00:00:00 2001 From: Mortada Mehyar Date: Sat, 28 May 2016 13:30:25 -0400 Subject: [PATCH 66/96] DOC: remove references to deprecated numpy negation method Author: Mortada Mehyar Closes #13310 from mortada/deprecated_negation_in_docs and squashes the following commits: 3b46a1b [Mortada Mehyar] DOC: remove references to deprecated numpy negation method --- pandas/core/common.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/common.py b/pandas/core/common.py index 03fe71d4f5125..d26c59e62de30 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -142,7 +142,7 @@ def _isnull_old(obj): def _use_inf_as_null(key): """Option change callback for null/inf behaviour - Choose which replacement for numpy.isnan / -numpy.isfinite is used. + Choose which replacement for numpy.isnan / ~numpy.isfinite is used. Parameters ---------- @@ -233,7 +233,7 @@ def _isnull_ndarraylike_old(obj): def notnull(obj): - """Replacement for numpy.isfinite / -numpy.isnan which is suitable for use + """Replacement for numpy.isfinite / ~numpy.isnan which is suitable for use on object arrays. 
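+
+    For example (editorial sketch added in editing, not part of the
+    original patch; it shows the ``~`` spelling that replaces the
+    deprecated unary ``-`` on boolean arrays):
+
+    >>> import numpy as np
+    >>> arr = np.array([1.5, np.inf, np.nan])
+    >>> ~np.isfinite(arr)
+    array([False,  True,  True], dtype=bool)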
    Parameters

@@ -1115,7 +1115,7 @@ def _possibly_cast_to_datetime(value, dtype, errors='raise'):

 def _possibly_infer_to_datetimelike(value, convert_dates=False):
     """
-    we might have a array (or single object) that is datetime like,
+    we might have an array (or single object) that is datetime like,
     and no dtype is passed don't change the value unless we find a
     datetime/timedelta set

From 70be8a95ebeb638aa53f465ba885a7b33b6c35d9 Mon Sep 17 00:00:00 2001
From: sinhrks
Date: Sun, 29 May 2016 10:45:10 -0400
Subject: [PATCH 67/96] DOC: Fix read_stata docstring

Author: sinhrks

Closes #13312 from sinhrks/stata_doc and squashes the following commits:

a9a7357 [sinhrks] DOC: Fix read_stata docstring
---
 pandas/io/stata.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/pandas/io/stata.py b/pandas/io/stata.py
index 6c6e11a53d2d3..ae7200cf6fb2e 100644
--- a/pandas/io/stata.py
+++ b/pandas/io/stata.py
@@ -89,12 +89,14 @@
 Examples
 --------
 Read a Stata dta file:
->> df = pandas.read_stata('filename.dta')
+
+>>> df = pandas.read_stata('filename.dta')

 Read a Stata dta file in 10,000 line chunks:
->> itr = pandas.read_stata('filename.dta', chunksize=10000)
->> for chunk in itr:
->> do_something(chunk)
+
+>>> itr = pandas.read_stata('filename.dta', chunksize=10000)
+>>> for chunk in itr:
+...     do_something(chunk)
 """ % (_statafile_processing_params1, _encoding_params,
        _statafile_processing_params2, _chunksize_params,
        _iterator_params)

From 721be6297ec571b18db1d1e212558fa0d4e149eb Mon Sep 17 00:00:00 2001
From: gfyoung
Date: Mon, 30 May 2016 09:26:41 -0400
Subject: [PATCH 68/96] BUG: Check for NaN after data conversion to numeric

Author: gfyoung

Closes #13314 from gfyoung/nan-check-post-numeric-conversion and squashes the following commits:

07f0538 [gfyoung] BUG: Check for NaN after data conversion to numeric
---
 doc/source/whatsnew/v0.18.2.txt     |  1 +
 pandas/io/tests/parser/na_values.py | 77 ++++++++++-------------------
 pandas/src/inference.pyx            |  8 ++-
 pandas/tests/test_lib.py            | 13 +++++
 4 files changed, 46 insertions(+), 53 deletions(-)

diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt
index dfb5ebc9379b1..262ad9773b71f 100644
--- a/doc/source/whatsnew/v0.18.2.txt
+++ b/doc/source/whatsnew/v0.18.2.txt
@@ -291,6 +291,7 @@ Bug Fixes

+- Bug in ``pd.read_csv()`` with ``engine='python'`` in which ``NaN`` values weren't being detected after data was converted to numeric values (:issue:`13314`)
 - Bug in ``MultiIndex`` slicing where extra elements were returned when level is non-unique (:issue:`12896`)

diff --git a/pandas/io/tests/parser/na_values.py b/pandas/io/tests/parser/na_values.py
index c34549835cb46..b03ae4ae9fc22 100644
--- a/pandas/io/tests/parser/na_values.py
+++ b/pandas/io/tests/parser/na_values.py
@@ -11,7 +11,7 @@
 import pandas.io.parsers as parsers
 import pandas.util.testing as tm

-from pandas import DataFrame, MultiIndex, read_csv
+from pandas import DataFrame, MultiIndex
 from pandas.compat import StringIO, range

@@ -43,57 +43,30 @@ def test_detect_string_na(self):
         tm.assert_numpy_array_equal(df.values, expected)

     def test_non_string_na_values(self):
-        # see gh-3611, na_values that are not a string are an issue
-        with tm.ensure_clean('__non_string_na_values__.csv') as path:
-            df = DataFrame({'A': [-999, 2, 3], 'B': [1.2, -999, 4.5]})
-            df.to_csv(path, sep=' ', index=False)
-            result1 = self.read_csv(path, sep=' ', header=0,
-                                    na_values=['-999.0', '-999'])
-            result2 = self.read_csv(path, sep=' ', header=0,
-                                    na_values=[-999, -999.0])
-            result3 =
self.read_csv(path, sep=' ', header=0, - na_values=[-999.0, -999]) - tm.assert_frame_equal(result1, result2) - tm.assert_frame_equal(result2, result3) - - result4 = self.read_csv( - path, sep=' ', header=0, na_values=['-999.0']) - result5 = self.read_csv( - path, sep=' ', header=0, na_values=['-999']) - result6 = self.read_csv( - path, sep=' ', header=0, na_values=[-999.0]) - result7 = self.read_csv( - path, sep=' ', header=0, na_values=[-999]) - tm.assert_frame_equal(result4, result3) - tm.assert_frame_equal(result5, result3) - tm.assert_frame_equal(result6, result3) - tm.assert_frame_equal(result7, result3) - - good_compare = result3 - - # with an odd float format, so we can't match the string 999.0 - # exactly, but need float matching - # TODO: change these to self.read_csv when Python bug is squashed - df.to_csv(path, sep=' ', index=False, float_format='%.3f') - result1 = read_csv(path, sep=' ', header=0, - na_values=['-999.0', '-999']) - result2 = read_csv(path, sep=' ', header=0, - na_values=[-999.0, -999]) - tm.assert_frame_equal(result1, good_compare) - tm.assert_frame_equal(result2, good_compare) - - result3 = read_csv(path, sep=' ', - header=0, na_values=['-999.0']) - result4 = read_csv(path, sep=' ', - header=0, na_values=['-999']) - result5 = read_csv(path, sep=' ', - header=0, na_values=[-999.0]) - result6 = read_csv(path, sep=' ', - header=0, na_values=[-999]) - tm.assert_frame_equal(result3, good_compare) - tm.assert_frame_equal(result4, good_compare) - tm.assert_frame_equal(result5, good_compare) - tm.assert_frame_equal(result6, good_compare) + # see gh-3611: with an odd float format, we can't match + # the string '999.0' exactly but still need float matching + nice = """A,B +-999,1.2 +2,-999 +3,4.5 +""" + ugly = """A,B +-999,1.200 +2,-999.000 +3,4.500 +""" + na_values_param = [['-999.0', '-999'], + [-999, -999.0], + [-999.0, -999], + ['-999.0'], ['-999'], + [-999.0], [-999]] + expected = DataFrame([[np.nan, 1.2], [2.0, np.nan], + [3.0, 4.5]], columns=['A', 'B']) + + for data in (nice, ugly): + for na_values in na_values_param: + out = self.read_csv(StringIO(data), na_values=na_values) + tm.assert_frame_equal(out, expected) def test_default_na_values(self): _NA_VALUES = set(['-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', diff --git a/pandas/src/inference.pyx b/pandas/src/inference.pyx index 3ccc1c4f9336c..e2c59a34bdf21 100644 --- a/pandas/src/inference.pyx +++ b/pandas/src/inference.pyx @@ -596,7 +596,13 @@ def maybe_convert_numeric(object[:] values, set na_values, else: try: status = floatify(val, &fval, &maybe_int) - floats[i] = fval + + if fval in na_values: + floats[i] = complexes[i] = nan + seen_float = True + else: + floats[i] = fval + if not seen_float: if maybe_int: as_int = int(val) diff --git a/pandas/tests/test_lib.py b/pandas/tests/test_lib.py index 2aa31063df446..c6a703673a4c4 100644 --- a/pandas/tests/test_lib.py +++ b/pandas/tests/test_lib.py @@ -188,6 +188,9 @@ def test_isinf_scalar(self): self.assertFalse(lib.isneginf_scalar(1)) self.assertFalse(lib.isneginf_scalar('a')) + +# tests related to functions imported from inference.pyx +class TestInference(tm.TestCase): def test_maybe_convert_numeric_infinities(self): # see gh-13274 infinities = ['inf', 'inF', 'iNf', 'Inf', @@ -227,6 +230,16 @@ def test_maybe_convert_numeric_infinities(self): np.array(['foo_' + infinity], dtype=object), na_values, maybe_int) + def test_maybe_convert_numeric_post_floatify_nan(self): + # see gh-13314 + data = np.array(['1.200', '-999.000', '4.500'], dtype=object) + expected = 
np.array([1.2, np.nan, 4.5], dtype=np.float64)
+        nan_values = set([-999, -999.0])
+
+        for coerce_type in (True, False):
+            out = lib.maybe_convert_numeric(data, nan_values, coerce_type)
+            tm.assert_numpy_array_equal(out, expected)
+

 class Testisscalar(tm.TestCase):

From ed4cd3a6051c735bc2fc5cc6a81aac83687378e5 Mon Sep 17 00:00:00 2001
From: gfyoung
Date: Mon, 30 May 2016 10:19:58 -0400
Subject: [PATCH 69/96] TST: Parser tests refactoring

1) Moved no columns test from CParser-only to `common.py`

2) Moved erroneously placed skiprows tests into their proper place

Author: gfyoung

Closes #13319 from gfyoung/test-parsers-refactor and squashes the following commits:

bc1402e [gfyoung] TST: Parser tests refactoring
---
 pandas/io/tests/parser/c_parser_only.py |   9 --
 pandas/io/tests/parser/common.py        |   9 ++
 pandas/io/tests/parser/na_values.py     | 114 ------------------------
 pandas/io/tests/parser/skiprows.py      | 114 ++++++++++++++++++++++++
 4 files changed, 123 insertions(+), 123 deletions(-)

diff --git a/pandas/io/tests/parser/c_parser_only.py b/pandas/io/tests/parser/c_parser_only.py
index aeee77bb02e98..9dde669c9d39d 100644
--- a/pandas/io/tests/parser/c_parser_only.py
+++ b/pandas/io/tests/parser/c_parser_only.py
@@ -419,15 +419,6 @@ def test_tokenize_CR_with_quoting(self):
         expected = self.read_csv(StringIO(data.replace('\r', '\n')))
         tm.assert_frame_equal(result, expected)

-    def test_raise_on_no_columns(self):
-        # single newline
-        data = "\n"
-        self.assertRaises(ValueError, self.read_csv, StringIO(data))
-
-        # test with more than a single newline
-        data = "\n\n\n"
-        self.assertRaises(ValueError, self.read_csv, StringIO(data))
-
     def test_grow_boundary_at_cap(self):
         # See gh-12494
         #
diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py
index 14f4de853e118..2e3c102948cfa 100644
--- a/pandas/io/tests/parser/common.py
+++ b/pandas/io/tests/parser/common.py
@@ -1323,3 +1323,12 @@ def test_inf_parsing(self):
         # TODO: remove condition when 'na_filter' is supported for Python
         df = self.read_csv(StringIO(data), index_col=0, na_filter=False)
         tm.assert_almost_equal(df['A'].values, expected.values)
+
+    def test_raise_on_no_columns(self):
+        # single newline
+        data = "\n"
+        self.assertRaises(EmptyDataError, self.read_csv, StringIO(data))
+
+        # test with more than a single newline
+        data = "\n\n\n"
+        self.assertRaises(EmptyDataError, self.read_csv, StringIO(data))
diff --git a/pandas/io/tests/parser/na_values.py b/pandas/io/tests/parser/na_values.py
index b03ae4ae9fc22..4705fd08af2b4 100644
--- a/pandas/io/tests/parser/na_values.py
+++ b/pandas/io/tests/parser/na_values.py
@@ -223,117 +223,3 @@ def test_na_values_keep_default(self):
                         'Three': ['None', 'two', 'None', 'nan', 'five', '', 'seven']})
         tm.assert_frame_equal(xp.reindex(columns=df.columns), df)
-
-    def test_skiprow_with_newline(self):
-        # see gh-12775 and gh-10911
-        data = """id,text,num_lines
-1,"line 11
-line 12",2
-2,"line 21
-line 22",2
-3,"line 31",1"""
-        expected = [[2, 'line 21\nline 22', 2],
-                    [3, 'line 31', 1]]
-        expected = DataFrame(expected, columns=[
-            'id', 'text', 'num_lines'])
-        df = self.read_csv(StringIO(data), skiprows=[1])
-        tm.assert_frame_equal(df, expected)
-
-        data = ('a,b,c\n~a\n b~,~e\n d~,'
-                '~f\n f~\n1,2,~12\n 13\n 14~')
-        expected = [['a\n b', 'e\n d', 'f\n f']]
-        expected = DataFrame(expected, columns=[
-            'a', 'b', 'c'])
-        df = self.read_csv(StringIO(data),
-                           quotechar="~",
-                           skiprows=[2])
-        tm.assert_frame_equal(df, expected)
-
-        data = ('Text,url\n~example\n '
-                'sentence\n one~,url1\n~'
-                'example\n 
sentence\n two~,url2\n~' - 'example\n sentence\n three~,url3') - expected = [['example\n sentence\n two', 'url2']] - expected = DataFrame(expected, columns=[ - 'Text', 'url']) - df = self.read_csv(StringIO(data), - quotechar="~", - skiprows=[1, 3]) - tm.assert_frame_equal(df, expected) - - def test_skiprow_with_quote(self): - # see gh-12775 and gh-10911 - data = """id,text,num_lines -1,"line '11' line 12",2 -2,"line '21' line 22",2 -3,"line '31' line 32",1""" - expected = [[2, "line '21' line 22", 2], - [3, "line '31' line 32", 1]] - expected = DataFrame(expected, columns=[ - 'id', 'text', 'num_lines']) - df = self.read_csv(StringIO(data), skiprows=[1]) - tm.assert_frame_equal(df, expected) - - def test_skiprow_with_newline_and_quote(self): - # see gh-12775 and gh-10911 - data = """id,text,num_lines -1,"line \n'11' line 12",2 -2,"line \n'21' line 22",2 -3,"line \n'31' line 32",1""" - expected = [[2, "line \n'21' line 22", 2], - [3, "line \n'31' line 32", 1]] - expected = DataFrame(expected, columns=[ - 'id', 'text', 'num_lines']) - df = self.read_csv(StringIO(data), skiprows=[1]) - tm.assert_frame_equal(df, expected) - - data = """id,text,num_lines -1,"line '11\n' line 12",2 -2,"line '21\n' line 22",2 -3,"line '31\n' line 32",1""" - expected = [[2, "line '21\n' line 22", 2], - [3, "line '31\n' line 32", 1]] - expected = DataFrame(expected, columns=[ - 'id', 'text', 'num_lines']) - df = self.read_csv(StringIO(data), skiprows=[1]) - tm.assert_frame_equal(df, expected) - - data = """id,text,num_lines -1,"line '11\n' \r\tline 12",2 -2,"line '21\n' \r\tline 22",2 -3,"line '31\n' \r\tline 32",1""" - expected = [[2, "line '21\n' \r\tline 22", 2], - [3, "line '31\n' \r\tline 32", 1]] - expected = DataFrame(expected, columns=[ - 'id', 'text', 'num_lines']) - df = self.read_csv(StringIO(data), skiprows=[1]) - tm.assert_frame_equal(df, expected) - - def test_skiprows_lineterminator(self): - # see gh-9079 - data = '\n'.join(['SMOSMANIA ThetaProbe-ML2X ', - '2007/01/01 01:00 0.2140 U M ', - '2007/01/01 02:00 0.2141 M O ', - '2007/01/01 04:00 0.2142 D M ']) - expected = DataFrame([['2007/01/01', '01:00', 0.2140, 'U', 'M'], - ['2007/01/01', '02:00', 0.2141, 'M', 'O'], - ['2007/01/01', '04:00', 0.2142, 'D', 'M']], - columns=['date', 'time', 'var', 'flag', - 'oflag']) - - # test with default line terminators "LF" and "CRLF" - df = self.read_csv(StringIO(data), skiprows=1, delim_whitespace=True, - names=['date', 'time', 'var', 'flag', 'oflag']) - tm.assert_frame_equal(df, expected) - - df = self.read_csv(StringIO(data.replace('\n', '\r\n')), - skiprows=1, delim_whitespace=True, - names=['date', 'time', 'var', 'flag', 'oflag']) - tm.assert_frame_equal(df, expected) - - # "CR" is not respected with the Python parser yet - if self.engine == 'c': - df = self.read_csv(StringIO(data.replace('\n', '\r')), - skiprows=1, delim_whitespace=True, - names=['date', 'time', 'var', 'flag', 'oflag']) - tm.assert_frame_equal(df, expected) diff --git a/pandas/io/tests/parser/skiprows.py b/pandas/io/tests/parser/skiprows.py index 3e585a9a623c9..c9f50dec6c01e 100644 --- a/pandas/io/tests/parser/skiprows.py +++ b/pandas/io/tests/parser/skiprows.py @@ -76,3 +76,117 @@ def test_skiprows_blank(self): datetime(2000, 1, 3)]) expected.index.name = 0 tm.assert_frame_equal(data, expected) + + def test_skiprow_with_newline(self): + # see gh-12775 and gh-10911 + data = """id,text,num_lines +1,"line 11 +line 12",2 +2,"line 21 +line 22",2 +3,"line 31",1""" + expected = [[2, 'line 21\nline 22', 2], + [3, 'line 31', 1]] + expected = 
DataFrame(expected, columns=[ + 'id', 'text', 'num_lines']) + df = self.read_csv(StringIO(data), skiprows=[1]) + tm.assert_frame_equal(df, expected) + + data = ('a,b,c\n~a\n b~,~e\n d~,' + '~f\n f~\n1,2,~12\n 13\n 14~') + expected = [['a\n b', 'e\n d', 'f\n f']] + expected = DataFrame(expected, columns=[ + 'a', 'b', 'c']) + df = self.read_csv(StringIO(data), + quotechar="~", + skiprows=[2]) + tm.assert_frame_equal(df, expected) + + data = ('Text,url\n~example\n ' + 'sentence\n one~,url1\n~' + 'example\n sentence\n two~,url2\n~' + 'example\n sentence\n three~,url3') + expected = [['example\n sentence\n two', 'url2']] + expected = DataFrame(expected, columns=[ + 'Text', 'url']) + df = self.read_csv(StringIO(data), + quotechar="~", + skiprows=[1, 3]) + tm.assert_frame_equal(df, expected) + + def test_skiprow_with_quote(self): + # see gh-12775 and gh-10911 + data = """id,text,num_lines +1,"line '11' line 12",2 +2,"line '21' line 22",2 +3,"line '31' line 32",1""" + expected = [[2, "line '21' line 22", 2], + [3, "line '31' line 32", 1]] + expected = DataFrame(expected, columns=[ + 'id', 'text', 'num_lines']) + df = self.read_csv(StringIO(data), skiprows=[1]) + tm.assert_frame_equal(df, expected) + + def test_skiprow_with_newline_and_quote(self): + # see gh-12775 and gh-10911 + data = """id,text,num_lines +1,"line \n'11' line 12",2 +2,"line \n'21' line 22",2 +3,"line \n'31' line 32",1""" + expected = [[2, "line \n'21' line 22", 2], + [3, "line \n'31' line 32", 1]] + expected = DataFrame(expected, columns=[ + 'id', 'text', 'num_lines']) + df = self.read_csv(StringIO(data), skiprows=[1]) + tm.assert_frame_equal(df, expected) + + data = """id,text,num_lines +1,"line '11\n' line 12",2 +2,"line '21\n' line 22",2 +3,"line '31\n' line 32",1""" + expected = [[2, "line '21\n' line 22", 2], + [3, "line '31\n' line 32", 1]] + expected = DataFrame(expected, columns=[ + 'id', 'text', 'num_lines']) + df = self.read_csv(StringIO(data), skiprows=[1]) + tm.assert_frame_equal(df, expected) + + data = """id,text,num_lines +1,"line '11\n' \r\tline 12",2 +2,"line '21\n' \r\tline 22",2 +3,"line '31\n' \r\tline 32",1""" + expected = [[2, "line '21\n' \r\tline 22", 2], + [3, "line '31\n' \r\tline 32", 1]] + expected = DataFrame(expected, columns=[ + 'id', 'text', 'num_lines']) + df = self.read_csv(StringIO(data), skiprows=[1]) + tm.assert_frame_equal(df, expected) + + def test_skiprows_lineterminator(self): + # see gh-9079 + data = '\n'.join(['SMOSMANIA ThetaProbe-ML2X ', + '2007/01/01 01:00 0.2140 U M ', + '2007/01/01 02:00 0.2141 M O ', + '2007/01/01 04:00 0.2142 D M ']) + expected = DataFrame([['2007/01/01', '01:00', 0.2140, 'U', 'M'], + ['2007/01/01', '02:00', 0.2141, 'M', 'O'], + ['2007/01/01', '04:00', 0.2142, 'D', 'M']], + columns=['date', 'time', 'var', 'flag', + 'oflag']) + + # test with default line terminators "LF" and "CRLF" + df = self.read_csv(StringIO(data), skiprows=1, delim_whitespace=True, + names=['date', 'time', 'var', 'flag', 'oflag']) + tm.assert_frame_equal(df, expected) + + df = self.read_csv(StringIO(data.replace('\n', '\r\n')), + skiprows=1, delim_whitespace=True, + names=['date', 'time', 'var', 'flag', 'oflag']) + tm.assert_frame_equal(df, expected) + + # "CR" is not respected with the Python parser yet + if self.engine == 'c': + df = self.read_csv(StringIO(data.replace('\n', '\r')), + skiprows=1, delim_whitespace=True, + names=['date', 'time', 'var', 'flag', 'oflag']) + tm.assert_frame_equal(df, expected) From cc1025a62019215a0fa38a891e07e6ca6ba656f1 Mon Sep 17 00:00:00 2001 From: Jenn Olsen 
Date: Mon, 30 May 2016 10:34:28 -0400
Subject: [PATCH 70/96] COMPAT: do not upcast results to float64 when a
 float32 scalar is combined (*, +, -, /) with a float64 array

closes #12388

Author: Jenn Olsen

Closes #12559 from jennolsen84/noevalupcast and squashes the following commits:

3f61252 [Jenn Olsen] do not upcast to float64 every time
---
 doc/source/whatsnew/v0.18.2.txt       |  1 +
 pandas/computation/expr.py            | 15 ++++++++++++++
 pandas/computation/ops.py             | 14 +++++++++++--
 pandas/computation/tests/test_eval.py | 29 +++++++++++++++++++++++++++
 4 files changed, 57 insertions(+), 2 deletions(-)

diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt
index 262ad9773b71f..2b67aca1dcf74 100644
--- a/doc/source/whatsnew/v0.18.2.txt
+++ b/doc/source/whatsnew/v0.18.2.txt
@@ -89,6 +89,7 @@ Other enhancements

 - ``pd.read_html()`` has gained support for the ``decimal`` option (:issue:`12907`)
+- ``eval``'s upcasting rules for ``float32`` types have been updated to be more consistent with NumPy's rules. The new behavior will not upcast to ``float64`` when you multiply a pandas ``float32`` object by a ``float64`` scalar. (:issue:`12388`)

 .. _whatsnew_0182.api:

diff --git a/pandas/computation/expr.py b/pandas/computation/expr.py
index 01d0fa664ac41..f1cf210754d12 100644
--- a/pandas/computation/expr.py
+++ b/pandas/computation/expr.py
@@ -5,6 +5,7 @@
 import tokenize
 from functools import partial

+import numpy as np
 import pandas as pd
 from pandas import compat
@@ -356,6 +357,19 @@ def _possibly_transform_eq_ne(self, node, left=None, right=None):
                                                right)
         return op, op_class, left, right

+    def _possibly_downcast_constants(self, left, right):
+        f32 = np.dtype(np.float32)
+        if left.isscalar and not right.isscalar and right.return_type == f32:
+            # right is a float32 array, left is a scalar
+            name = self.env.add_tmp(np.float32(left.value))
+            left = self.term_type(name, self.env)
+        if right.isscalar and not left.isscalar and left.return_type == f32:
+            # left is a float32 array, right is a scalar
+            name = self.env.add_tmp(np.float32(right.value))
+            right = self.term_type(name, self.env)
+
+        return left, right
+
     def _possibly_eval(self, binop, eval_in_python):
         # eval `in` and `not in` (for now) in "partial" python space
         # things that can be evaluated in "eval" space will be turned into
@@ -399,6 +413,7 @@ def _possibly_evaluate_binop(self, op, op_class, lhs, rhs,

     def visit_BinOp(self, node, **kwargs):
         op, op_class, left, right = self._possibly_transform_eq_ne(node)
+        left, right = self._possibly_downcast_constants(left, right)
         return self._possibly_evaluate_binop(op, op_class, left, right)

     def visit_Div(self, node, **kwargs):
diff --git a/pandas/computation/ops.py b/pandas/computation/ops.py
index 603c030dcaa6e..bf6fa35cf255f 100644
--- a/pandas/computation/ops.py
+++ b/pandas/computation/ops.py
@@ -276,18 +276,26 @@ def _not_in(x, y):
 _binary_ops_dict.update(d)


-def _cast_inplace(terms, dtype):
+def _cast_inplace(terms, acceptable_dtypes, dtype):
     """Cast an expression inplace.

     Parameters
     ----------
     terms : Op
         The expression that should be cast.
+    acceptable_dtypes : list of acceptable numpy.dtype
+        Will not cast if the term's dtype is in this list.
+
+        .. versionadded:: 0.18.2
+
     dtype : str or numpy.dtype
         The dtype to cast to.
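+
+    Examples
+    --------
+    Editorial sketch added in editing (not part of the original patch):
+    the user-visible effect of ``acceptable_dtypes`` is that ``pd.eval``
+    no longer upcasts ``float32`` operands during true division:
+
+    >>> df = pd.DataFrame({'a': [1.0, 2.0]}, dtype=np.float32)
+    >>> pd.eval('df / 3').dtypes
+    a    float32
+    dtype: object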
""" dt = np.dtype(dtype) for term in terms: + if term.type in acceptable_dtypes: + continue + try: new_value = term.value.astype(dt) except AttributeError: @@ -452,7 +460,9 @@ def __init__(self, lhs, rhs, truediv, *args, **kwargs): rhs.return_type)) if truediv or PY3: - _cast_inplace(com.flatten(self), np.float_) + # do not upcast float32s to float64 un-necessarily + acceptable_dtypes = [np.float32, np.float_] + _cast_inplace(com.flatten(self), acceptable_dtypes, np.float_) _unary_ops_syms = '+', '-', '~', 'not' diff --git a/pandas/computation/tests/test_eval.py b/pandas/computation/tests/test_eval.py index aaafcb5b41645..4dc1e24618a83 100644 --- a/pandas/computation/tests/test_eval.py +++ b/pandas/computation/tests/test_eval.py @@ -749,6 +749,35 @@ def check_chained_cmp_op(self, lhs, cmp1, mid, cmp2, rhs): ENGINES_PARSERS = list(product(_engines, expr._parsers)) +#------------------------------------- +# typecasting rules consistency with python +# issue #12388 + +class TestTypeCasting(tm.TestCase): + + def check_binop_typecasting(self, engine, parser, op, dt): + tm.skip_if_no_ne(engine) + df = mkdf(5, 3, data_gen_f=f, dtype=dt) + s = 'df {} 3'.format(op) + res = pd.eval(s, engine=engine, parser=parser) + self.assertTrue(df.values.dtype == dt) + self.assertTrue(res.values.dtype == dt) + assert_frame_equal(res, eval(s)) + + s = '3 {} df'.format(op) + res = pd.eval(s, engine=engine, parser=parser) + self.assertTrue(df.values.dtype == dt) + self.assertTrue(res.values.dtype == dt) + assert_frame_equal(res, eval(s)) + + def test_binop_typecasting(self): + for engine, parser in ENGINES_PARSERS: + for op in ['+', '-', '*', '**', '/']: + # maybe someday... numexpr has too many upcasting rules now + #for dt in chain(*(np.sctypes[x] for x in ['uint', 'int', 'float'])): + for dt in [np.float32, np.float64]: + yield self.check_binop_typecasting, engine, parser, op, dt + #------------------------------------- # basic and complex alignment From d6f814c6b08b8ad627673dc71ed63b202cb46b70 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Mon, 30 May 2016 10:40:09 -0400 Subject: [PATCH 71/96] TST: remove tests_tseries.py and distribute to other tests files Author: Jeff Reback Closes #13325 from jreback/reorg_tests and squashes the following commits: c7e045e [Jeff Reback] TST: remove tests_tseries.py and distribute to other tests files --- pandas/tests/test_algos.py | 316 +++++++++- pandas/tests/test_infer_and_convert.py | 384 ++++++++++++ pandas/tests/test_lib.py | 269 ++------- pandas/tests/test_tseries.py | 714 ----------------------- pandas/tseries/tests/test_bin_groupby.py | 151 +++++ pandas/tseries/tests/test_period.py | 10 +- pandas/tseries/tests/test_tslib.py | 31 +- 7 files changed, 942 insertions(+), 933 deletions(-) create mode 100644 pandas/tests/test_infer_and_convert.py delete mode 100644 pandas/tests/test_tseries.py create mode 100644 pandas/tseries/tests/test_bin_groupby.py diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 4758c7f979da0..be8468d426946 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -3,15 +3,20 @@ import numpy as np from numpy.random import RandomState +from numpy import nan +import datetime -from pandas.core.api import Series, Categorical, CategoricalIndex +from pandas import Series, Categorical, CategoricalIndex, Index import pandas as pd from pandas import compat +import pandas.algos as _algos +from pandas.compat import lrange import pandas.core.algorithms as algos import pandas.util.testing as tm import pandas.hashtable 
as hashtable from pandas.compat.numpy import np_array_datetime64_compat +from pandas.util.testing import assert_almost_equal class TestMatch(tm.TestCase): @@ -705,6 +710,315 @@ def test_unique_label_indices(): tm.assert_numpy_array_equal(left, right) +def test_rank(): + tm._skip_if_no_scipy() + from scipy.stats import rankdata + + def _check(arr): + mask = ~np.isfinite(arr) + arr = arr.copy() + result = _algos.rank_1d_float64(arr) + arr[mask] = np.inf + exp = rankdata(arr) + exp[mask] = nan + assert_almost_equal(result, exp) + + _check(np.array([nan, nan, 5., 5., 5., nan, 1, 2, 3, nan])) + _check(np.array([4., nan, 5., 5., 5., nan, 1, 2, 4., nan])) + + +def test_pad_backfill_object_segfault(): + + old = np.array([], dtype='O') + new = np.array([datetime.datetime(2010, 12, 31)], dtype='O') + + result = _algos.pad_object(old, new) + expected = np.array([-1], dtype=np.int64) + assert (np.array_equal(result, expected)) + + result = _algos.pad_object(new, old) + expected = np.array([], dtype=np.int64) + assert (np.array_equal(result, expected)) + + result = _algos.backfill_object(old, new) + expected = np.array([-1], dtype=np.int64) + assert (np.array_equal(result, expected)) + + result = _algos.backfill_object(new, old) + expected = np.array([], dtype=np.int64) + assert (np.array_equal(result, expected)) + + +def test_arrmap(): + values = np.array(['foo', 'foo', 'bar', 'bar', 'baz', 'qux'], dtype='O') + result = _algos.arrmap_object(values, lambda x: x in ['foo', 'bar']) + assert (result.dtype == np.bool_) + + +class TestTseriesUtil(tm.TestCase): + _multiprocess_can_split_ = True + + def test_combineFunc(self): + pass + + def test_reindex(self): + pass + + def test_isnull(self): + pass + + def test_groupby(self): + pass + + def test_groupby_withnull(self): + pass + + def test_backfill(self): + old = Index([1, 5, 10]) + new = Index(lrange(12)) + + filler = _algos.backfill_int64(old.values, new.values) + + expect_filler = np.array([0, 0, 1, 1, 1, 1, + 2, 2, 2, 2, 2, -1], dtype=np.int64) + self.assert_numpy_array_equal(filler, expect_filler) + + # corner case + old = Index([1, 4]) + new = Index(lrange(5, 10)) + filler = _algos.backfill_int64(old.values, new.values) + + expect_filler = np.array([-1, -1, -1, -1, -1], dtype=np.int64) + self.assert_numpy_array_equal(filler, expect_filler) + + def test_pad(self): + old = Index([1, 5, 10]) + new = Index(lrange(12)) + + filler = _algos.pad_int64(old.values, new.values) + + expect_filler = np.array([-1, 0, 0, 0, 0, 1, + 1, 1, 1, 1, 2, 2], dtype=np.int64) + self.assert_numpy_array_equal(filler, expect_filler) + + # corner case + old = Index([5, 10]) + new = Index(lrange(5)) + filler = _algos.pad_int64(old.values, new.values) + expect_filler = np.array([-1, -1, -1, -1, -1], dtype=np.int64) + self.assert_numpy_array_equal(filler, expect_filler) + + +def test_left_join_indexer_unique(): + a = np.array([1, 2, 3, 4, 5], dtype=np.int64) + b = np.array([2, 2, 3, 4, 4], dtype=np.int64) + + result = _algos.left_join_indexer_unique_int64(b, a) + expected = np.array([1, 1, 2, 3, 3], dtype=np.int64) + assert (np.array_equal(result, expected)) + + +def test_left_outer_join_bug(): + left = np.array([0, 1, 0, 1, 1, 2, 3, 1, 0, 2, 1, 2, 0, 1, 1, 2, 3, 2, 3, + 2, 1, 1, 3, 0, 3, 2, 3, 0, 0, 2, 3, 2, 0, 3, 1, 3, 0, 1, + 3, 0, 0, 1, 0, 3, 1, 0, 1, 0, 1, 1, 0, 2, 2, 2, 2, 2, 0, + 3, 1, 2, 0, 0, 3, 1, 3, 2, 2, 0, 1, 3, 0, 2, 3, 2, 3, 3, + 2, 3, 3, 1, 3, 2, 0, 0, 3, 1, 1, 1, 0, 2, 3, 3, 1, 2, 0, + 3, 1, 2, 0, 2], dtype=np.int64) + + right = np.array([3, 1], dtype=np.int64) + 
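+    # Editor's note -- illustrative sketch added in editing, not part of
+    # the original patch: left_outer_join returns parallel take-indexers
+    # (lidx, ridx); left rows with no match on the right get -1 in ridx,
+    # exactly as exp_ridx is built below. Roughly, for left=[3, 1, 0] and
+    # right=[3, 1] with sort=False:
+    #     lidx -> [0, 1, 2]
+    #     ridx -> [0, 1, -1]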
max_groups = 4 + + lidx, ridx = _algos.left_outer_join(left, right, max_groups, sort=False) + + exp_lidx = np.arange(len(left)) + exp_ridx = -np.ones(len(left)) + exp_ridx[left == 1] = 1 + exp_ridx[left == 3] = 0 + + assert (np.array_equal(lidx, exp_lidx)) + assert (np.array_equal(ridx, exp_ridx)) + + +def test_inner_join_indexer(): + a = np.array([1, 2, 3, 4, 5], dtype=np.int64) + b = np.array([0, 3, 5, 7, 9], dtype=np.int64) + + index, ares, bres = _algos.inner_join_indexer_int64(a, b) + + index_exp = np.array([3, 5], dtype=np.int64) + assert_almost_equal(index, index_exp) + + aexp = np.array([2, 4], dtype=np.int64) + bexp = np.array([1, 2], dtype=np.int64) + assert_almost_equal(ares, aexp) + assert_almost_equal(bres, bexp) + + a = np.array([5], dtype=np.int64) + b = np.array([5], dtype=np.int64) + + index, ares, bres = _algos.inner_join_indexer_int64(a, b) + tm.assert_numpy_array_equal(index, np.array([5], dtype=np.int64)) + tm.assert_numpy_array_equal(ares, np.array([0], dtype=np.int64)) + tm.assert_numpy_array_equal(bres, np.array([0], dtype=np.int64)) + + +def test_outer_join_indexer(): + a = np.array([1, 2, 3, 4, 5], dtype=np.int64) + b = np.array([0, 3, 5, 7, 9], dtype=np.int64) + + index, ares, bres = _algos.outer_join_indexer_int64(a, b) + + index_exp = np.array([0, 1, 2, 3, 4, 5, 7, 9], dtype=np.int64) + assert_almost_equal(index, index_exp) + + aexp = np.array([-1, 0, 1, 2, 3, 4, -1, -1], dtype=np.int64) + bexp = np.array([0, -1, -1, 1, -1, 2, 3, 4], dtype=np.int64) + assert_almost_equal(ares, aexp) + assert_almost_equal(bres, bexp) + + a = np.array([5], dtype=np.int64) + b = np.array([5], dtype=np.int64) + + index, ares, bres = _algos.outer_join_indexer_int64(a, b) + tm.assert_numpy_array_equal(index, np.array([5], dtype=np.int64)) + tm.assert_numpy_array_equal(ares, np.array([0], dtype=np.int64)) + tm.assert_numpy_array_equal(bres, np.array([0], dtype=np.int64)) + + +def test_left_join_indexer(): + a = np.array([1, 2, 3, 4, 5], dtype=np.int64) + b = np.array([0, 3, 5, 7, 9], dtype=np.int64) + + index, ares, bres = _algos.left_join_indexer_int64(a, b) + + assert_almost_equal(index, a) + + aexp = np.array([0, 1, 2, 3, 4], dtype=np.int64) + bexp = np.array([-1, -1, 1, -1, 2], dtype=np.int64) + assert_almost_equal(ares, aexp) + assert_almost_equal(bres, bexp) + + a = np.array([5], dtype=np.int64) + b = np.array([5], dtype=np.int64) + + index, ares, bres = _algos.left_join_indexer_int64(a, b) + tm.assert_numpy_array_equal(index, np.array([5], dtype=np.int64)) + tm.assert_numpy_array_equal(ares, np.array([0], dtype=np.int64)) + tm.assert_numpy_array_equal(bres, np.array([0], dtype=np.int64)) + + +def test_left_join_indexer2(): + idx = Index([1, 1, 2, 5]) + idx2 = Index([1, 2, 5, 7, 9]) + + res, lidx, ridx = _algos.left_join_indexer_int64(idx2.values, idx.values) + + exp_res = np.array([1, 1, 2, 5, 7, 9], dtype=np.int64) + assert_almost_equal(res, exp_res) + + exp_lidx = np.array([0, 0, 1, 2, 3, 4], dtype=np.int64) + assert_almost_equal(lidx, exp_lidx) + + exp_ridx = np.array([0, 1, 2, 3, -1, -1], dtype=np.int64) + assert_almost_equal(ridx, exp_ridx) + + +def test_outer_join_indexer2(): + idx = Index([1, 1, 2, 5]) + idx2 = Index([1, 2, 5, 7, 9]) + + res, lidx, ridx = _algos.outer_join_indexer_int64(idx2.values, idx.values) + + exp_res = np.array([1, 1, 2, 5, 7, 9], dtype=np.int64) + assert_almost_equal(res, exp_res) + + exp_lidx = np.array([0, 0, 1, 2, 3, 4], dtype=np.int64) + assert_almost_equal(lidx, exp_lidx) + + exp_ridx = np.array([0, 1, 2, 3, -1, -1], dtype=np.int64) + 
assert_almost_equal(ridx, exp_ridx) + + +def test_inner_join_indexer2(): + idx = Index([1, 1, 2, 5]) + idx2 = Index([1, 2, 5, 7, 9]) + + res, lidx, ridx = _algos.inner_join_indexer_int64(idx2.values, idx.values) + + exp_res = np.array([1, 1, 2, 5], dtype=np.int64) + assert_almost_equal(res, exp_res) + + exp_lidx = np.array([0, 0, 1, 2], dtype=np.int64) + assert_almost_equal(lidx, exp_lidx) + + exp_ridx = np.array([0, 1, 2, 3], dtype=np.int64) + assert_almost_equal(ridx, exp_ridx) + + +def test_is_lexsorted(): + failure = [ + np.array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, + 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, + 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0]), + np.array([30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, + 15, 14, + 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 30, 29, 28, + 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, + 12, 11, + 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 30, 29, 28, 27, 26, 25, + 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, + 9, 8, + 7, 6, 5, 4, 3, 2, 1, 0, 30, 29, 28, 27, 26, 25, 24, 23, 22, + 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, + 6, 5, + 4, 3, 2, 1, 0])] + + assert (not _algos.is_lexsorted(failure)) + +# def test_get_group_index(): +# a = np.array([0, 1, 2, 0, 2, 1, 0, 0], dtype=np.int64) +# b = np.array([1, 0, 3, 2, 0, 2, 3, 0], dtype=np.int64) +# expected = np.array([1, 4, 11, 2, 8, 6, 3, 0], dtype=np.int64) + +# result = lib.get_group_index([a, b], (3, 4)) + +# assert(np.array_equal(result, expected)) + + +def test_groupsort_indexer(): + a = np.random.randint(0, 1000, 100).astype(np.int64) + b = np.random.randint(0, 1000, 100).astype(np.int64) + + result = _algos.groupsort_indexer(a, 1000)[0] + + # need to use a stable sort + expected = np.argsort(a, kind='mergesort') + assert (np.array_equal(result, expected)) + + # compare with lexsort + key = a * 1000 + b + result = _algos.groupsort_indexer(key, 1000000)[0] + expected = np.lexsort((b, a)) + assert (np.array_equal(result, expected)) + + +def test_ensure_platform_int(): + arr = np.arange(100) + + result = _algos.ensure_platform_int(arr) + assert (result is arr) + + if __name__ == '__main__': import nose nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], diff --git a/pandas/tests/test_infer_and_convert.py b/pandas/tests/test_infer_and_convert.py new file mode 100644 index 0000000000000..06e2a82e07dee --- /dev/null +++ b/pandas/tests/test_infer_and_convert.py @@ -0,0 +1,384 @@ +# -*- coding: utf-8 -*- + +from datetime import datetime, timedelta, date, time + +import numpy as np +import pandas as pd +import pandas.lib as lib +import pandas.util.testing as tm +from pandas import Index + +from pandas.compat import long, u, PY2 + + +class TestInference(tm.TestCase): + + def test_infer_dtype_bytes(self): + compare = 'string' if PY2 else 'bytes' + + # string array of bytes + arr = np.array(list('abc'), dtype='S1') + self.assertEqual(pd.lib.infer_dtype(arr), compare) + + # object array of bytes + arr = arr.astype(object) + self.assertEqual(pd.lib.infer_dtype(arr), compare) + + def test_isinf_scalar(self): + # GH 11352 + self.assertTrue(lib.isposinf_scalar(float('inf'))) + self.assertTrue(lib.isposinf_scalar(np.inf)) + self.assertFalse(lib.isposinf_scalar(-np.inf)) + 
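+        # Editor's note -- added in editing, not part of the original
+        # patch: as these assertions show, the scalar infinity checks
+        # answer False for ordinary numbers, strings and the opposite
+        # infinity instead of raising.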
self.assertFalse(lib.isposinf_scalar(1)) + self.assertFalse(lib.isposinf_scalar('a')) + + self.assertTrue(lib.isneginf_scalar(float('-inf'))) + self.assertTrue(lib.isneginf_scalar(-np.inf)) + self.assertFalse(lib.isneginf_scalar(np.inf)) + self.assertFalse(lib.isneginf_scalar(1)) + self.assertFalse(lib.isneginf_scalar('a')) + + def test_maybe_convert_numeric_infinities(self): + # see gh-13274 + infinities = ['inf', 'inF', 'iNf', 'Inf', + 'iNF', 'InF', 'INf', 'INF'] + na_values = set(['', 'NULL', 'nan']) + + pos = np.array(['inf'], dtype=np.float64) + neg = np.array(['-inf'], dtype=np.float64) + + msg = "Unable to parse string" + + for infinity in infinities: + for maybe_int in (True, False): + out = lib.maybe_convert_numeric( + np.array([infinity], dtype=object), + na_values, maybe_int) + tm.assert_numpy_array_equal(out, pos) + + out = lib.maybe_convert_numeric( + np.array(['-' + infinity], dtype=object), + na_values, maybe_int) + tm.assert_numpy_array_equal(out, neg) + + out = lib.maybe_convert_numeric( + np.array([u(infinity)], dtype=object), + na_values, maybe_int) + tm.assert_numpy_array_equal(out, pos) + + out = lib.maybe_convert_numeric( + np.array(['+' + infinity], dtype=object), + na_values, maybe_int) + tm.assert_numpy_array_equal(out, pos) + + # too many characters + with tm.assertRaisesRegexp(ValueError, msg): + lib.maybe_convert_numeric( + np.array(['foo_' + infinity], dtype=object), + na_values, maybe_int) + + def test_maybe_convert_numeric_post_floatify_nan(self): + # see gh-13314 + data = np.array(['1.200', '-999.000', '4.500'], dtype=object) + expected = np.array([1.2, np.nan, 4.5], dtype=np.float64) + nan_values = set([-999, -999.0]) + + for coerce_type in (True, False): + out = lib.maybe_convert_numeric(data, nan_values, coerce_type) + tm.assert_numpy_array_equal(out, expected) + + def test_convert_infs(self): + arr = np.array(['inf', 'inf', 'inf'], dtype='O') + result = lib.maybe_convert_numeric(arr, set(), False) + self.assertTrue(result.dtype == np.float64) + + arr = np.array(['-inf', '-inf', '-inf'], dtype='O') + result = lib.maybe_convert_numeric(arr, set(), False) + self.assertTrue(result.dtype == np.float64) + + def test_scientific_no_exponent(self): + # See PR 12215 + arr = np.array(['42E', '2E', '99e', '6e'], dtype='O') + result = lib.maybe_convert_numeric(arr, set(), False, True) + self.assertTrue(np.all(np.isnan(result))) + + +class TestTypeInference(tm.TestCase): + _multiprocess_can_split_ = True + + def test_length_zero(self): + result = lib.infer_dtype(np.array([], dtype='i4')) + self.assertEqual(result, 'integer') + + result = lib.infer_dtype([]) + self.assertEqual(result, 'empty') + + def test_integers(self): + arr = np.array([1, 2, 3, np.int64(4), np.int32(5)], dtype='O') + result = lib.infer_dtype(arr) + self.assertEqual(result, 'integer') + + arr = np.array([1, 2, 3, np.int64(4), np.int32(5), 'foo'], dtype='O') + result = lib.infer_dtype(arr) + self.assertEqual(result, 'mixed-integer') + + arr = np.array([1, 2, 3, 4, 5], dtype='i4') + result = lib.infer_dtype(arr) + self.assertEqual(result, 'integer') + + def test_bools(self): + arr = np.array([True, False, True, True, True], dtype='O') + result = lib.infer_dtype(arr) + self.assertEqual(result, 'boolean') + + arr = np.array([np.bool_(True), np.bool_(False)], dtype='O') + result = lib.infer_dtype(arr) + self.assertEqual(result, 'boolean') + + arr = np.array([True, False, True, 'foo'], dtype='O') + result = lib.infer_dtype(arr) + self.assertEqual(result, 'mixed') + + arr = np.array([True, False, True], 
dtype=bool) + result = lib.infer_dtype(arr) + self.assertEqual(result, 'boolean') + + def test_floats(self): + arr = np.array([1., 2., 3., np.float64(4), np.float32(5)], dtype='O') + result = lib.infer_dtype(arr) + self.assertEqual(result, 'floating') + + arr = np.array([1, 2, 3, np.float64(4), np.float32(5), 'foo'], + dtype='O') + result = lib.infer_dtype(arr) + self.assertEqual(result, 'mixed-integer') + + arr = np.array([1, 2, 3, 4, 5], dtype='f4') + result = lib.infer_dtype(arr) + self.assertEqual(result, 'floating') + + arr = np.array([1, 2, 3, 4, 5], dtype='f8') + result = lib.infer_dtype(arr) + self.assertEqual(result, 'floating') + + def test_string(self): + pass + + def test_unicode(self): + pass + + def test_datetime(self): + + dates = [datetime(2012, 1, x) for x in range(1, 20)] + index = Index(dates) + self.assertEqual(index.inferred_type, 'datetime64') + + def test_date(self): + + dates = [date(2012, 1, x) for x in range(1, 20)] + index = Index(dates) + self.assertEqual(index.inferred_type, 'date') + + def test_to_object_array_tuples(self): + r = (5, 6) + values = [r] + result = lib.to_object_array_tuples(values) + + try: + # make sure record array works + from collections import namedtuple + record = namedtuple('record', 'x y') + r = record(5, 6) + values = [r] + result = lib.to_object_array_tuples(values) # noqa + except ImportError: + pass + + def test_object(self): + + # GH 7431 + # cannot infer more than this as only a single element + arr = np.array([None], dtype='O') + result = lib.infer_dtype(arr) + self.assertEqual(result, 'mixed') + + def test_categorical(self): + + # GH 8974 + from pandas import Categorical, Series + arr = Categorical(list('abc')) + result = lib.infer_dtype(arr) + self.assertEqual(result, 'categorical') + + result = lib.infer_dtype(Series(arr)) + self.assertEqual(result, 'categorical') + + arr = Categorical(list('abc'), categories=['cegfab'], ordered=True) + result = lib.infer_dtype(arr) + self.assertEqual(result, 'categorical') + + result = lib.infer_dtype(Series(arr)) + self.assertEqual(result, 'categorical') + + +class TestConvert(tm.TestCase): + + def test_convert_objects(self): + arr = np.array(['a', 'b', np.nan, np.nan, 'd', 'e', 'f'], dtype='O') + result = lib.maybe_convert_objects(arr) + self.assertTrue(result.dtype == np.object_) + + def test_convert_objects_ints(self): + # test that we can detect many kinds of integers + dtypes = ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8'] + + for dtype_str in dtypes: + arr = np.array(list(np.arange(20, dtype=dtype_str)), dtype='O') + self.assertTrue(arr[0].dtype == np.dtype(dtype_str)) + result = lib.maybe_convert_objects(arr) + self.assertTrue(issubclass(result.dtype.type, np.integer)) + + def test_convert_objects_complex_number(self): + for dtype in np.sctypes['complex']: + arr = np.array(list(1j * np.arange(20, dtype=dtype)), dtype='O') + self.assertTrue(arr[0].dtype == np.dtype(dtype)) + result = lib.maybe_convert_objects(arr) + self.assertTrue(issubclass(result.dtype.type, np.complexfloating)) + + +class Testisscalar(tm.TestCase): + + def test_isscalar_builtin_scalars(self): + self.assertTrue(lib.isscalar(None)) + self.assertTrue(lib.isscalar(True)) + self.assertTrue(lib.isscalar(False)) + self.assertTrue(lib.isscalar(0.)) + self.assertTrue(lib.isscalar(np.nan)) + self.assertTrue(lib.isscalar('foobar')) + self.assertTrue(lib.isscalar(b'foobar')) + self.assertTrue(lib.isscalar(u('efoobar'))) + self.assertTrue(lib.isscalar(datetime(2014, 1, 1))) + self.assertTrue(lib.isscalar(date(2014, 1, 1))) + 
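+        # Editor's note -- added in editing, not part of the original
+        # patch: lib.isscalar is deliberately broader than np.isscalar;
+        # e.g. np.isscalar(None) is False, while lib.isscalar(None) is
+        # True (asserted above).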
self.assertTrue(lib.isscalar(time(12, 0))) + self.assertTrue(lib.isscalar(timedelta(hours=1))) + self.assertTrue(lib.isscalar(pd.NaT)) + + def test_isscalar_builtin_nonscalars(self): + self.assertFalse(lib.isscalar({})) + self.assertFalse(lib.isscalar([])) + self.assertFalse(lib.isscalar([1])) + self.assertFalse(lib.isscalar(())) + self.assertFalse(lib.isscalar((1, ))) + self.assertFalse(lib.isscalar(slice(None))) + self.assertFalse(lib.isscalar(Ellipsis)) + + def test_isscalar_numpy_array_scalars(self): + self.assertTrue(lib.isscalar(np.int64(1))) + self.assertTrue(lib.isscalar(np.float64(1.))) + self.assertTrue(lib.isscalar(np.int32(1))) + self.assertTrue(lib.isscalar(np.object_('foobar'))) + self.assertTrue(lib.isscalar(np.str_('foobar'))) + self.assertTrue(lib.isscalar(np.unicode_(u('foobar')))) + self.assertTrue(lib.isscalar(np.bytes_(b'foobar'))) + self.assertTrue(lib.isscalar(np.datetime64('2014-01-01'))) + self.assertTrue(lib.isscalar(np.timedelta64(1, 'h'))) + + def test_isscalar_numpy_zerodim_arrays(self): + for zerodim in [np.array(1), np.array('foobar'), + np.array(np.datetime64('2014-01-01')), + np.array(np.timedelta64(1, 'h')), + np.array(np.datetime64('NaT'))]: + self.assertFalse(lib.isscalar(zerodim)) + self.assertTrue(lib.isscalar(lib.item_from_zerodim(zerodim))) + + def test_isscalar_numpy_arrays(self): + self.assertFalse(lib.isscalar(np.array([]))) + self.assertFalse(lib.isscalar(np.array([[]]))) + self.assertFalse(lib.isscalar(np.matrix('1; 2'))) + + def test_isscalar_pandas_scalars(self): + self.assertTrue(lib.isscalar(pd.Timestamp('2014-01-01'))) + self.assertTrue(lib.isscalar(pd.Timedelta(hours=1))) + self.assertTrue(lib.isscalar(pd.Period('2014-01-01'))) + + def test_lisscalar_pandas_containers(self): + self.assertFalse(lib.isscalar(pd.Series())) + self.assertFalse(lib.isscalar(pd.Series([1]))) + self.assertFalse(lib.isscalar(pd.DataFrame())) + self.assertFalse(lib.isscalar(pd.DataFrame([[1]]))) + self.assertFalse(lib.isscalar(pd.Panel())) + self.assertFalse(lib.isscalar(pd.Panel([[[1]]]))) + self.assertFalse(lib.isscalar(pd.Index([]))) + self.assertFalse(lib.isscalar(pd.Index([1]))) + + +class TestParseSQL(tm.TestCase): + + def test_convert_sql_column_floats(self): + arr = np.array([1.5, None, 3, 4.2], dtype=object) + result = lib.convert_sql_column(arr) + expected = np.array([1.5, np.nan, 3, 4.2], dtype='f8') + self.assert_numpy_array_equal(result, expected) + + def test_convert_sql_column_strings(self): + arr = np.array(['1.5', None, '3', '4.2'], dtype=object) + result = lib.convert_sql_column(arr) + expected = np.array(['1.5', np.nan, '3', '4.2'], dtype=object) + self.assert_numpy_array_equal(result, expected) + + def test_convert_sql_column_unicode(self): + arr = np.array([u('1.5'), None, u('3'), u('4.2')], + dtype=object) + result = lib.convert_sql_column(arr) + expected = np.array([u('1.5'), np.nan, u('3'), u('4.2')], + dtype=object) + self.assert_numpy_array_equal(result, expected) + + def test_convert_sql_column_ints(self): + arr = np.array([1, 2, 3, 4], dtype='O') + arr2 = np.array([1, 2, 3, 4], dtype='i4').astype('O') + result = lib.convert_sql_column(arr) + result2 = lib.convert_sql_column(arr2) + expected = np.array([1, 2, 3, 4], dtype='i8') + self.assert_numpy_array_equal(result, expected) + self.assert_numpy_array_equal(result2, expected) + + arr = np.array([1, 2, 3, None, 4], dtype='O') + result = lib.convert_sql_column(arr) + expected = np.array([1, 2, 3, np.nan, 4], dtype='f8') + self.assert_numpy_array_equal(result, expected) + + def 
test_convert_sql_column_longs(self): + arr = np.array([long(1), long(2), long(3), long(4)], dtype='O') + result = lib.convert_sql_column(arr) + expected = np.array([1, 2, 3, 4], dtype='i8') + self.assert_numpy_array_equal(result, expected) + + arr = np.array([long(1), long(2), long(3), None, long(4)], dtype='O') + result = lib.convert_sql_column(arr) + expected = np.array([1, 2, 3, np.nan, 4], dtype='f8') + self.assert_numpy_array_equal(result, expected) + + def test_convert_sql_column_bools(self): + arr = np.array([True, False, True, False], dtype='O') + result = lib.convert_sql_column(arr) + expected = np.array([True, False, True, False], dtype=bool) + self.assert_numpy_array_equal(result, expected) + + arr = np.array([True, False, None, False], dtype='O') + result = lib.convert_sql_column(arr) + expected = np.array([True, False, np.nan, False], dtype=object) + self.assert_numpy_array_equal(result, expected) + + def test_convert_sql_column_decimals(self): + from decimal import Decimal + arr = np.array([Decimal('1.5'), None, Decimal('3'), Decimal('4.2')]) + result = lib.convert_sql_column(arr) + expected = np.array([1.5, np.nan, 3, 4.2], dtype='f8') + self.assert_numpy_array_equal(result, expected) + +if __name__ == '__main__': + import nose + + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tests/test_lib.py b/pandas/tests/test_lib.py index c6a703673a4c4..bfac0aa83b434 100644 --- a/pandas/tests/test_lib.py +++ b/pandas/tests/test_lib.py @@ -1,19 +1,9 @@ # -*- coding: utf-8 -*- -from datetime import datetime, timedelta, date, time - import numpy as np -import pandas as pd import pandas.lib as lib import pandas.util.testing as tm -from pandas.compat import long, u, PY2 - - -def _assert_same_values_and_dtype(res, exp): - tm.assert_equal(res.dtype, exp.dtype) - tm.assert_almost_equal(res, exp) - class TestMisc(tm.TestCase): @@ -34,16 +24,8 @@ def test_max_len_string_array(self): tm.assertRaises(TypeError, lambda: lib.max_len_string_array(arr.astype('U'))) - def test_infer_dtype_bytes(self): - compare = 'string' if PY2 else 'bytes' - - # string array of bytes - arr = np.array(list('abc'), dtype='S1') - self.assertEqual(pd.lib.infer_dtype(arr), compare) - # object array of bytes - arr = arr.astype(object) - self.assertEqual(pd.lib.infer_dtype(arr), compare) +class TestIndexing(tm.TestCase): def test_maybe_indices_to_slice_left_edge(self): target = np.arange(100) @@ -174,203 +156,58 @@ def test_maybe_indices_to_slice_middle(self): self.assert_numpy_array_equal(maybe_slice, indices) self.assert_numpy_array_equal(target[indices], target[maybe_slice]) - def test_isinf_scalar(self): - # GH 11352 - self.assertTrue(lib.isposinf_scalar(float('inf'))) - self.assertTrue(lib.isposinf_scalar(np.inf)) - self.assertFalse(lib.isposinf_scalar(-np.inf)) - self.assertFalse(lib.isposinf_scalar(1)) - self.assertFalse(lib.isposinf_scalar('a')) - - self.assertTrue(lib.isneginf_scalar(float('-inf'))) - self.assertTrue(lib.isneginf_scalar(-np.inf)) - self.assertFalse(lib.isneginf_scalar(np.inf)) - self.assertFalse(lib.isneginf_scalar(1)) - self.assertFalse(lib.isneginf_scalar('a')) - - -# tests related to functions imported from inference.pyx -class TestInference(tm.TestCase): - def test_maybe_convert_numeric_infinities(self): - # see gh-13274 - infinities = ['inf', 'inF', 'iNf', 'Inf', - 'iNF', 'InF', 'INf', 'INF'] - na_values = set(['', 'NULL', 'nan']) - - pos = np.array(['inf'], dtype=np.float64) - neg = np.array(['-inf'], dtype=np.float64) - - msg = 
"Unable to parse string" - - for infinity in infinities: - for maybe_int in (True, False): - out = lib.maybe_convert_numeric( - np.array([infinity], dtype=object), - na_values, maybe_int) - tm.assert_numpy_array_equal(out, pos) - - out = lib.maybe_convert_numeric( - np.array(['-' + infinity], dtype=object), - na_values, maybe_int) - tm.assert_numpy_array_equal(out, neg) - - out = lib.maybe_convert_numeric( - np.array([u(infinity)], dtype=object), - na_values, maybe_int) - tm.assert_numpy_array_equal(out, pos) - - out = lib.maybe_convert_numeric( - np.array(['+' + infinity], dtype=object), - na_values, maybe_int) - tm.assert_numpy_array_equal(out, pos) - - # too many characters - with tm.assertRaisesRegexp(ValueError, msg): - lib.maybe_convert_numeric( - np.array(['foo_' + infinity], dtype=object), - na_values, maybe_int) - - def test_maybe_convert_numeric_post_floatify_nan(self): - # see gh-13314 - data = np.array(['1.200', '-999.000', '4.500'], dtype=object) - expected = np.array([1.2, np.nan, 4.5], dtype=np.float64) - nan_values = set([-999, -999.0]) - - for coerce_type in (True, False): - out = lib.maybe_convert_numeric(data, nan_values, coerce_type) - tm.assert_numpy_array_equal(out, expected) - - -class Testisscalar(tm.TestCase): - - def test_isscalar_builtin_scalars(self): - self.assertTrue(lib.isscalar(None)) - self.assertTrue(lib.isscalar(True)) - self.assertTrue(lib.isscalar(False)) - self.assertTrue(lib.isscalar(0.)) - self.assertTrue(lib.isscalar(np.nan)) - self.assertTrue(lib.isscalar('foobar')) - self.assertTrue(lib.isscalar(b'foobar')) - self.assertTrue(lib.isscalar(u('efoobar'))) - self.assertTrue(lib.isscalar(datetime(2014, 1, 1))) - self.assertTrue(lib.isscalar(date(2014, 1, 1))) - self.assertTrue(lib.isscalar(time(12, 0))) - self.assertTrue(lib.isscalar(timedelta(hours=1))) - self.assertTrue(lib.isscalar(pd.NaT)) - - def test_isscalar_builtin_nonscalars(self): - self.assertFalse(lib.isscalar({})) - self.assertFalse(lib.isscalar([])) - self.assertFalse(lib.isscalar([1])) - self.assertFalse(lib.isscalar(())) - self.assertFalse(lib.isscalar((1, ))) - self.assertFalse(lib.isscalar(slice(None))) - self.assertFalse(lib.isscalar(Ellipsis)) - - def test_isscalar_numpy_array_scalars(self): - self.assertTrue(lib.isscalar(np.int64(1))) - self.assertTrue(lib.isscalar(np.float64(1.))) - self.assertTrue(lib.isscalar(np.int32(1))) - self.assertTrue(lib.isscalar(np.object_('foobar'))) - self.assertTrue(lib.isscalar(np.str_('foobar'))) - self.assertTrue(lib.isscalar(np.unicode_(u('foobar')))) - self.assertTrue(lib.isscalar(np.bytes_(b'foobar'))) - self.assertTrue(lib.isscalar(np.datetime64('2014-01-01'))) - self.assertTrue(lib.isscalar(np.timedelta64(1, 'h'))) - - def test_isscalar_numpy_zerodim_arrays(self): - for zerodim in [np.array(1), np.array('foobar'), - np.array(np.datetime64('2014-01-01')), - np.array(np.timedelta64(1, 'h')), - np.array(np.datetime64('NaT'))]: - self.assertFalse(lib.isscalar(zerodim)) - self.assertTrue(lib.isscalar(lib.item_from_zerodim(zerodim))) - - def test_isscalar_numpy_arrays(self): - self.assertFalse(lib.isscalar(np.array([]))) - self.assertFalse(lib.isscalar(np.array([[]]))) - self.assertFalse(lib.isscalar(np.matrix('1; 2'))) - - def test_isscalar_pandas_scalars(self): - self.assertTrue(lib.isscalar(pd.Timestamp('2014-01-01'))) - self.assertTrue(lib.isscalar(pd.Timedelta(hours=1))) - self.assertTrue(lib.isscalar(pd.Period('2014-01-01'))) - - def test_lisscalar_pandas_containers(self): - self.assertFalse(lib.isscalar(pd.Series())) - 
self.assertFalse(lib.isscalar(pd.Series([1]))) - self.assertFalse(lib.isscalar(pd.DataFrame())) - self.assertFalse(lib.isscalar(pd.DataFrame([[1]]))) - self.assertFalse(lib.isscalar(pd.Panel())) - self.assertFalse(lib.isscalar(pd.Panel([[[1]]]))) - self.assertFalse(lib.isscalar(pd.Index([]))) - self.assertFalse(lib.isscalar(pd.Index([1]))) - - -class TestParseSQL(tm.TestCase): - - def test_convert_sql_column_floats(self): - arr = np.array([1.5, None, 3, 4.2], dtype=object) - result = lib.convert_sql_column(arr) - expected = np.array([1.5, np.nan, 3, 4.2], dtype='f8') - _assert_same_values_and_dtype(result, expected) - - def test_convert_sql_column_strings(self): - arr = np.array(['1.5', None, '3', '4.2'], dtype=object) - result = lib.convert_sql_column(arr) - expected = np.array(['1.5', np.nan, '3', '4.2'], dtype=object) - _assert_same_values_and_dtype(result, expected) - - def test_convert_sql_column_unicode(self): - arr = np.array([u('1.5'), None, u('3'), u('4.2')], - dtype=object) - result = lib.convert_sql_column(arr) - expected = np.array([u('1.5'), np.nan, u('3'), u('4.2')], - dtype=object) - _assert_same_values_and_dtype(result, expected) - - def test_convert_sql_column_ints(self): - arr = np.array([1, 2, 3, 4], dtype='O') - arr2 = np.array([1, 2, 3, 4], dtype='i4').astype('O') - result = lib.convert_sql_column(arr) - result2 = lib.convert_sql_column(arr2) - expected = np.array([1, 2, 3, 4], dtype='i8') - _assert_same_values_and_dtype(result, expected) - _assert_same_values_and_dtype(result2, expected) - - arr = np.array([1, 2, 3, None, 4], dtype='O') - result = lib.convert_sql_column(arr) - expected = np.array([1, 2, 3, np.nan, 4], dtype='f8') - _assert_same_values_and_dtype(result, expected) - - def test_convert_sql_column_longs(self): - arr = np.array([long(1), long(2), long(3), long(4)], dtype='O') - result = lib.convert_sql_column(arr) - expected = np.array([1, 2, 3, 4], dtype='i8') - _assert_same_values_and_dtype(result, expected) - - arr = np.array([long(1), long(2), long(3), None, long(4)], dtype='O') - result = lib.convert_sql_column(arr) - expected = np.array([1, 2, 3, np.nan, 4], dtype='f8') - _assert_same_values_and_dtype(result, expected) - - def test_convert_sql_column_bools(self): - arr = np.array([True, False, True, False], dtype='O') - result = lib.convert_sql_column(arr) - expected = np.array([True, False, True, False], dtype=bool) - _assert_same_values_and_dtype(result, expected) - - arr = np.array([True, False, None, False], dtype='O') - result = lib.convert_sql_column(arr) - expected = np.array([True, False, np.nan, False], dtype=object) - _assert_same_values_and_dtype(result, expected) - - def test_convert_sql_column_decimals(self): - from decimal import Decimal - arr = np.array([Decimal('1.5'), None, Decimal('3'), Decimal('4.2')]) - result = lib.convert_sql_column(arr) - expected = np.array([1.5, np.nan, 3, 4.2], dtype='f8') - _assert_same_values_and_dtype(result, expected) + def test_maybe_booleans_to_slice(self): + arr = np.array([0, 0, 1, 1, 1, 0, 1], dtype=np.uint8) + result = lib.maybe_booleans_to_slice(arr) + self.assertTrue(result.dtype == np.bool_) + + result = lib.maybe_booleans_to_slice(arr[:0]) + self.assertTrue(result == slice(0, 0)) + + def test_get_reverse_indexer(self): + indexer = np.array([-1, -1, 1, 2, 0, -1, 3, 4], dtype=np.int64) + result = lib.get_reverse_indexer(indexer, 5) + expected = np.array([4, 2, 3, 6, 7], dtype=np.int64) + self.assertTrue(np.array_equal(result, expected)) + + +def test_duplicated_with_nas(): + keys = np.array([0, 
1, np.nan, 0, 2, np.nan], dtype=object) + + result = lib.duplicated(keys) + expected = [False, False, False, True, False, True] + assert (np.array_equal(result, expected)) + + result = lib.duplicated(keys, keep='first') + expected = [False, False, False, True, False, True] + assert (np.array_equal(result, expected)) + + result = lib.duplicated(keys, keep='last') + expected = [True, False, True, False, False, False] + assert (np.array_equal(result, expected)) + + result = lib.duplicated(keys, keep=False) + expected = [True, False, True, True, False, True] + assert (np.array_equal(result, expected)) + + keys = np.empty(8, dtype=object) + for i, t in enumerate(zip([0, 0, np.nan, np.nan] * 2, + [0, np.nan, 0, np.nan] * 2)): + keys[i] = t + + result = lib.duplicated(keys) + falses = [False] * 4 + trues = [True] * 4 + expected = falses + trues + assert (np.array_equal(result, expected)) + + result = lib.duplicated(keys, keep='last') + expected = trues + falses + assert (np.array_equal(result, expected)) + + result = lib.duplicated(keys, keep=False) + expected = trues + trues + assert (np.array_equal(result, expected)) if __name__ == '__main__': import nose diff --git a/pandas/tests/test_tseries.py b/pandas/tests/test_tseries.py deleted file mode 100644 index 4dd1cf54a5527..0000000000000 --- a/pandas/tests/test_tseries.py +++ /dev/null @@ -1,714 +0,0 @@ -# -*- coding: utf-8 -*- -from numpy import nan -import numpy as np -from pandas import Index, isnull, Timestamp -from pandas.util.testing import assert_almost_equal -import pandas.util.testing as tm -from pandas.compat import range, lrange, zip -import pandas.lib as lib -import pandas._period as period -import pandas.algos as algos -from pandas.core import common as com -import datetime - - -class TestTseriesUtil(tm.TestCase): - _multiprocess_can_split_ = True - - def test_combineFunc(self): - pass - - def test_reindex(self): - pass - - def test_isnull(self): - pass - - def test_groupby(self): - pass - - def test_groupby_withnull(self): - pass - - def test_backfill(self): - old = Index([1, 5, 10]) - new = Index(lrange(12)) - - filler = algos.backfill_int64(old.values, new.values) - - expect_filler = np.array([0, 0, 1, 1, 1, 1, - 2, 2, 2, 2, 2, -1], dtype=np.int64) - self.assert_numpy_array_equal(filler, expect_filler) - - # corner case - old = Index([1, 4]) - new = Index(lrange(5, 10)) - filler = algos.backfill_int64(old.values, new.values) - - expect_filler = np.array([-1, -1, -1, -1, -1], dtype=np.int64) - self.assert_numpy_array_equal(filler, expect_filler) - - def test_pad(self): - old = Index([1, 5, 10]) - new = Index(lrange(12)) - - filler = algos.pad_int64(old.values, new.values) - - expect_filler = np.array([-1, 0, 0, 0, 0, 1, - 1, 1, 1, 1, 2, 2], dtype=np.int64) - self.assert_numpy_array_equal(filler, expect_filler) - - # corner case - old = Index([5, 10]) - new = Index(lrange(5)) - filler = algos.pad_int64(old.values, new.values) - expect_filler = np.array([-1, -1, -1, -1, -1], dtype=np.int64) - self.assert_numpy_array_equal(filler, expect_filler) - - -def test_left_join_indexer_unique(): - a = np.array([1, 2, 3, 4, 5], dtype=np.int64) - b = np.array([2, 2, 3, 4, 4], dtype=np.int64) - - result = algos.left_join_indexer_unique_int64(b, a) - expected = np.array([1, 1, 2, 3, 3], dtype=np.int64) - assert (np.array_equal(result, expected)) - - -def test_left_outer_join_bug(): - left = np.array([0, 1, 0, 1, 1, 2, 3, 1, 0, 2, 1, 2, 0, 1, 1, 2, 3, 2, 3, - 2, 1, 1, 3, 0, 3, 2, 3, 0, 0, 2, 3, 2, 0, 3, 1, 3, 0, 1, - 3, 0, 0, 1, 0, 3, 1, 0, 1, 0, 
1, 1, 0, 2, 2, 2, 2, 2, 0, - 3, 1, 2, 0, 0, 3, 1, 3, 2, 2, 0, 1, 3, 0, 2, 3, 2, 3, 3, - 2, 3, 3, 1, 3, 2, 0, 0, 3, 1, 1, 1, 0, 2, 3, 3, 1, 2, 0, - 3, 1, 2, 0, 2], dtype=np.int64) - - right = np.array([3, 1], dtype=np.int64) - max_groups = 4 - - lidx, ridx = algos.left_outer_join(left, right, max_groups, sort=False) - - exp_lidx = np.arange(len(left)) - exp_ridx = -np.ones(len(left)) - exp_ridx[left == 1] = 1 - exp_ridx[left == 3] = 0 - - assert (np.array_equal(lidx, exp_lidx)) - assert (np.array_equal(ridx, exp_ridx)) - - -def test_inner_join_indexer(): - a = np.array([1, 2, 3, 4, 5], dtype=np.int64) - b = np.array([0, 3, 5, 7, 9], dtype=np.int64) - - index, ares, bres = algos.inner_join_indexer_int64(a, b) - - index_exp = np.array([3, 5], dtype=np.int64) - assert_almost_equal(index, index_exp) - - aexp = np.array([2, 4], dtype=np.int64) - bexp = np.array([1, 2], dtype=np.int64) - assert_almost_equal(ares, aexp) - assert_almost_equal(bres, bexp) - - a = np.array([5], dtype=np.int64) - b = np.array([5], dtype=np.int64) - - index, ares, bres = algos.inner_join_indexer_int64(a, b) - tm.assert_numpy_array_equal(index, np.array([5], dtype=np.int64)) - tm.assert_numpy_array_equal(ares, np.array([0], dtype=np.int64)) - tm.assert_numpy_array_equal(bres, np.array([0], dtype=np.int64)) - - -def test_outer_join_indexer(): - a = np.array([1, 2, 3, 4, 5], dtype=np.int64) - b = np.array([0, 3, 5, 7, 9], dtype=np.int64) - - index, ares, bres = algos.outer_join_indexer_int64(a, b) - - index_exp = np.array([0, 1, 2, 3, 4, 5, 7, 9], dtype=np.int64) - assert_almost_equal(index, index_exp) - - aexp = np.array([-1, 0, 1, 2, 3, 4, -1, -1], dtype=np.int64) - bexp = np.array([0, -1, -1, 1, -1, 2, 3, 4], dtype=np.int64) - assert_almost_equal(ares, aexp) - assert_almost_equal(bres, bexp) - - a = np.array([5], dtype=np.int64) - b = np.array([5], dtype=np.int64) - - index, ares, bres = algos.outer_join_indexer_int64(a, b) - tm.assert_numpy_array_equal(index, np.array([5], dtype=np.int64)) - tm.assert_numpy_array_equal(ares, np.array([0], dtype=np.int64)) - tm.assert_numpy_array_equal(bres, np.array([0], dtype=np.int64)) - - -def test_left_join_indexer(): - a = np.array([1, 2, 3, 4, 5], dtype=np.int64) - b = np.array([0, 3, 5, 7, 9], dtype=np.int64) - - index, ares, bres = algos.left_join_indexer_int64(a, b) - - assert_almost_equal(index, a) - - aexp = np.array([0, 1, 2, 3, 4], dtype=np.int64) - bexp = np.array([-1, -1, 1, -1, 2], dtype=np.int64) - assert_almost_equal(ares, aexp) - assert_almost_equal(bres, bexp) - - a = np.array([5], dtype=np.int64) - b = np.array([5], dtype=np.int64) - - index, ares, bres = algos.left_join_indexer_int64(a, b) - tm.assert_numpy_array_equal(index, np.array([5], dtype=np.int64)) - tm.assert_numpy_array_equal(ares, np.array([0], dtype=np.int64)) - tm.assert_numpy_array_equal(bres, np.array([0], dtype=np.int64)) - - -def test_left_join_indexer2(): - idx = Index([1, 1, 2, 5]) - idx2 = Index([1, 2, 5, 7, 9]) - - res, lidx, ridx = algos.left_join_indexer_int64(idx2.values, idx.values) - - exp_res = np.array([1, 1, 2, 5, 7, 9], dtype=np.int64) - assert_almost_equal(res, exp_res) - - exp_lidx = np.array([0, 0, 1, 2, 3, 4], dtype=np.int64) - assert_almost_equal(lidx, exp_lidx) - - exp_ridx = np.array([0, 1, 2, 3, -1, -1], dtype=np.int64) - assert_almost_equal(ridx, exp_ridx) - - -def test_outer_join_indexer2(): - idx = Index([1, 1, 2, 5]) - idx2 = Index([1, 2, 5, 7, 9]) - - res, lidx, ridx = algos.outer_join_indexer_int64(idx2.values, idx.values) - - exp_res = np.array([1, 1, 2, 5, 7, 9], 
dtype=np.int64) - assert_almost_equal(res, exp_res) - - exp_lidx = np.array([0, 0, 1, 2, 3, 4], dtype=np.int64) - assert_almost_equal(lidx, exp_lidx) - - exp_ridx = np.array([0, 1, 2, 3, -1, -1], dtype=np.int64) - assert_almost_equal(ridx, exp_ridx) - - -def test_inner_join_indexer2(): - idx = Index([1, 1, 2, 5]) - idx2 = Index([1, 2, 5, 7, 9]) - - res, lidx, ridx = algos.inner_join_indexer_int64(idx2.values, idx.values) - - exp_res = np.array([1, 1, 2, 5], dtype=np.int64) - assert_almost_equal(res, exp_res) - - exp_lidx = np.array([0, 0, 1, 2], dtype=np.int64) - assert_almost_equal(lidx, exp_lidx) - - exp_ridx = np.array([0, 1, 2, 3], dtype=np.int64) - assert_almost_equal(ridx, exp_ridx) - - -def test_is_lexsorted(): - failure = [ - np.array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, - 3, 3, - 3, 3, - 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, - 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0]), - np.array([30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, - 15, 14, - 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 30, 29, 28, - 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, - 12, 11, - 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 30, 29, 28, 27, 26, 25, - 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, - 9, 8, - 7, 6, 5, 4, 3, 2, 1, 0, 30, 29, 28, 27, 26, 25, 24, 23, 22, - 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, - 6, 5, - 4, 3, 2, 1, 0])] - - assert (not algos.is_lexsorted(failure)) - -# def test_get_group_index(): -# a = np.array([0, 1, 2, 0, 2, 1, 0, 0], dtype=np.int64) -# b = np.array([1, 0, 3, 2, 0, 2, 3, 0], dtype=np.int64) -# expected = np.array([1, 4, 11, 2, 8, 6, 3, 0], dtype=np.int64) - -# result = lib.get_group_index([a, b], (3, 4)) - -# assert(np.array_equal(result, expected)) - - -def test_groupsort_indexer(): - a = np.random.randint(0, 1000, 100).astype(np.int64) - b = np.random.randint(0, 1000, 100).astype(np.int64) - - result = algos.groupsort_indexer(a, 1000)[0] - - # need to use a stable sort - expected = np.argsort(a, kind='mergesort') - assert (np.array_equal(result, expected)) - - # compare with lexsort - key = a * 1000 + b - result = algos.groupsort_indexer(key, 1000000)[0] - expected = np.lexsort((b, a)) - assert (np.array_equal(result, expected)) - - -def test_ensure_platform_int(): - arr = np.arange(100) - - result = algos.ensure_platform_int(arr) - assert (result is arr) - - -def test_duplicated_with_nas(): - keys = np.array([0, 1, nan, 0, 2, nan], dtype=object) - - result = lib.duplicated(keys) - expected = [False, False, False, True, False, True] - assert (np.array_equal(result, expected)) - - result = lib.duplicated(keys, keep='first') - expected = [False, False, False, True, False, True] - assert (np.array_equal(result, expected)) - - result = lib.duplicated(keys, keep='last') - expected = [True, False, True, False, False, False] - assert (np.array_equal(result, expected)) - - result = lib.duplicated(keys, keep=False) - expected = [True, False, True, True, False, True] - assert (np.array_equal(result, expected)) - - keys = np.empty(8, dtype=object) - for i, t in enumerate(zip([0, 0, nan, nan] * 2, [0, nan, 0, nan] * 2)): - keys[i] = t - - result = lib.duplicated(keys) - falses = [False] * 4 - trues = [True] * 4 - expected = falses + trues - assert (np.array_equal(result, expected)) - - 
result = lib.duplicated(keys, keep='last') - expected = trues + falses - assert (np.array_equal(result, expected)) - - result = lib.duplicated(keys, keep=False) - expected = trues + trues - assert (np.array_equal(result, expected)) - - -def test_maybe_booleans_to_slice(): - arr = np.array([0, 0, 1, 1, 1, 0, 1], dtype=np.uint8) - result = lib.maybe_booleans_to_slice(arr) - assert (result.dtype == np.bool_) - - result = lib.maybe_booleans_to_slice(arr[:0]) - assert (result == slice(0, 0)) - - -def test_convert_objects(): - arr = np.array(['a', 'b', nan, nan, 'd', 'e', 'f'], dtype='O') - result = lib.maybe_convert_objects(arr) - assert (result.dtype == np.object_) - - -def test_convert_infs(): - arr = np.array(['inf', 'inf', 'inf'], dtype='O') - result = lib.maybe_convert_numeric(arr, set(), False) - assert (result.dtype == np.float64) - - arr = np.array(['-inf', '-inf', '-inf'], dtype='O') - result = lib.maybe_convert_numeric(arr, set(), False) - assert (result.dtype == np.float64) - - -def test_scientific_no_exponent(): - # See PR 12215 - arr = np.array(['42E', '2E', '99e', '6e'], dtype='O') - result = lib.maybe_convert_numeric(arr, set(), False, True) - assert np.all(np.isnan(result)) - - -def test_convert_objects_ints(): - # test that we can detect many kinds of integers - dtypes = ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8'] - - for dtype_str in dtypes: - arr = np.array(list(np.arange(20, dtype=dtype_str)), dtype='O') - assert (arr[0].dtype == np.dtype(dtype_str)) - result = lib.maybe_convert_objects(arr) - assert (issubclass(result.dtype.type, np.integer)) - - -def test_convert_objects_complex_number(): - for dtype in np.sctypes['complex']: - arr = np.array(list(1j * np.arange(20, dtype=dtype)), dtype='O') - assert (arr[0].dtype == np.dtype(dtype)) - result = lib.maybe_convert_objects(arr) - assert (issubclass(result.dtype.type, np.complexfloating)) - - -def test_rank(): - tm._skip_if_no_scipy() - from scipy.stats import rankdata - - def _check(arr): - mask = ~np.isfinite(arr) - arr = arr.copy() - result = algos.rank_1d_float64(arr) - arr[mask] = np.inf - exp = rankdata(arr) - exp[mask] = nan - assert_almost_equal(result, exp) - - _check(np.array([nan, nan, 5., 5., 5., nan, 1, 2, 3, nan])) - _check(np.array([4., nan, 5., 5., 5., nan, 1, 2, 4., nan])) - - -def test_get_reverse_indexer(): - indexer = np.array([-1, -1, 1, 2, 0, -1, 3, 4], dtype=np.int64) - result = lib.get_reverse_indexer(indexer, 5) - expected = np.array([4, 2, 3, 6, 7], dtype=np.int64) - assert (np.array_equal(result, expected)) - - -def test_pad_backfill_object_segfault(): - - old = np.array([], dtype='O') - new = np.array([datetime.datetime(2010, 12, 31)], dtype='O') - - result = algos.pad_object(old, new) - expected = np.array([-1], dtype=np.int64) - assert (np.array_equal(result, expected)) - - result = algos.pad_object(new, old) - expected = np.array([], dtype=np.int64) - assert (np.array_equal(result, expected)) - - result = algos.backfill_object(old, new) - expected = np.array([-1], dtype=np.int64) - assert (np.array_equal(result, expected)) - - result = algos.backfill_object(new, old) - expected = np.array([], dtype=np.int64) - assert (np.array_equal(result, expected)) - - -def test_arrmap(): - values = np.array(['foo', 'foo', 'bar', 'bar', 'baz', 'qux'], dtype='O') - result = algos.arrmap_object(values, lambda x: x in ['foo', 'bar']) - assert (result.dtype == np.bool_) - - -def test_series_grouper(): - from pandas import Series - obj = Series(np.random.randn(10)) - dummy = obj[:0] - - labels = np.array([-1, 
-1, -1, 0, 0, 0, 1, 1, 1, 1], dtype=np.int64) - - grouper = lib.SeriesGrouper(obj, np.mean, labels, 2, dummy) - result, counts = grouper.get_result() - - expected = np.array([obj[3:6].mean(), obj[6:].mean()]) - assert_almost_equal(result, expected) - - exp_counts = np.array([3, 4], dtype=np.int64) - assert_almost_equal(counts, exp_counts) - - -def test_series_bin_grouper(): - from pandas import Series - obj = Series(np.random.randn(10)) - dummy = obj[:0] - - bins = np.array([3, 6]) - - grouper = lib.SeriesBinGrouper(obj, np.mean, bins, dummy) - result, counts = grouper.get_result() - - expected = np.array([obj[:3].mean(), obj[3:6].mean(), obj[6:].mean()]) - assert_almost_equal(result, expected) - - exp_counts = np.array([3, 3, 4], dtype=np.int64) - assert_almost_equal(counts, exp_counts) - - -class TestBinGroupers(tm.TestCase): - _multiprocess_can_split_ = True - - def setUp(self): - self.obj = np.random.randn(10, 1) - self.labels = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 2], dtype=np.int64) - self.bins = np.array([3, 6], dtype=np.int64) - - def test_generate_bins(self): - from pandas.core.groupby import generate_bins_generic - values = np.array([1, 2, 3, 4, 5, 6], dtype=np.int64) - binner = np.array([0, 3, 6, 9], dtype=np.int64) - - for func in [lib.generate_bins_dt64, generate_bins_generic]: - bins = func(values, binner, closed='left') - assert ((bins == np.array([2, 5, 6])).all()) - - bins = func(values, binner, closed='right') - assert ((bins == np.array([3, 6, 6])).all()) - - for func in [lib.generate_bins_dt64, generate_bins_generic]: - values = np.array([1, 2, 3, 4, 5, 6], dtype=np.int64) - binner = np.array([0, 3, 6], dtype=np.int64) - - bins = func(values, binner, closed='right') - assert ((bins == np.array([3, 6])).all()) - - self.assertRaises(ValueError, generate_bins_generic, values, [], - 'right') - self.assertRaises(ValueError, generate_bins_generic, values[:0], - binner, 'right') - - self.assertRaises(ValueError, generate_bins_generic, values, [4], - 'right') - self.assertRaises(ValueError, generate_bins_generic, values, [-3, -1], - 'right') - - -def test_group_ohlc(): - def _check(dtype): - obj = np.array(np.random.randn(20), dtype=dtype) - - bins = np.array([6, 12, 20]) - out = np.zeros((3, 4), dtype) - counts = np.zeros(len(out), dtype=np.int64) - labels = com._ensure_int64(np.repeat(np.arange(3), - np.diff(np.r_[0, bins]))) - - func = getattr(algos, 'group_ohlc_%s' % dtype) - func(out, counts, obj[:, None], labels) - - def _ohlc(group): - if isnull(group).all(): - return np.repeat(nan, 4) - return [group[0], group.max(), group.min(), group[-1]] - - expected = np.array([_ohlc(obj[:6]), _ohlc(obj[6:12]), - _ohlc(obj[12:])]) - - assert_almost_equal(out, expected) - tm.assert_numpy_array_equal(counts, - np.array([6, 6, 8], dtype=np.int64)) - - obj[:6] = nan - func(out, counts, obj[:, None], labels) - expected[0] = nan - assert_almost_equal(out, expected) - - _check('float32') - _check('float64') - - -def test_try_parse_dates(): - from dateutil.parser import parse - - arr = np.array(['5/1/2000', '6/1/2000', '7/1/2000'], dtype=object) - - result = lib.try_parse_dates(arr, dayfirst=True) - expected = [parse(d, dayfirst=True) for d in arr] - assert (np.array_equal(result, expected)) - - -class TestTypeInference(tm.TestCase): - _multiprocess_can_split_ = True - - def test_length_zero(self): - result = lib.infer_dtype(np.array([], dtype='i4')) - self.assertEqual(result, 'integer') - - result = lib.infer_dtype([]) - self.assertEqual(result, 'empty') - - def test_integers(self): - arr 
= np.array([1, 2, 3, np.int64(4), np.int32(5)], dtype='O') - result = lib.infer_dtype(arr) - self.assertEqual(result, 'integer') - - arr = np.array([1, 2, 3, np.int64(4), np.int32(5), 'foo'], dtype='O') - result = lib.infer_dtype(arr) - self.assertEqual(result, 'mixed-integer') - - arr = np.array([1, 2, 3, 4, 5], dtype='i4') - result = lib.infer_dtype(arr) - self.assertEqual(result, 'integer') - - def test_bools(self): - arr = np.array([True, False, True, True, True], dtype='O') - result = lib.infer_dtype(arr) - self.assertEqual(result, 'boolean') - - arr = np.array([np.bool_(True), np.bool_(False)], dtype='O') - result = lib.infer_dtype(arr) - self.assertEqual(result, 'boolean') - - arr = np.array([True, False, True, 'foo'], dtype='O') - result = lib.infer_dtype(arr) - self.assertEqual(result, 'mixed') - - arr = np.array([True, False, True], dtype=bool) - result = lib.infer_dtype(arr) - self.assertEqual(result, 'boolean') - - def test_floats(self): - arr = np.array([1., 2., 3., np.float64(4), np.float32(5)], dtype='O') - result = lib.infer_dtype(arr) - self.assertEqual(result, 'floating') - - arr = np.array([1, 2, 3, np.float64(4), np.float32(5), 'foo'], - dtype='O') - result = lib.infer_dtype(arr) - self.assertEqual(result, 'mixed-integer') - - arr = np.array([1, 2, 3, 4, 5], dtype='f4') - result = lib.infer_dtype(arr) - self.assertEqual(result, 'floating') - - arr = np.array([1, 2, 3, 4, 5], dtype='f8') - result = lib.infer_dtype(arr) - self.assertEqual(result, 'floating') - - def test_string(self): - pass - - def test_unicode(self): - pass - - def test_datetime(self): - - dates = [datetime.datetime(2012, 1, x) for x in range(1, 20)] - index = Index(dates) - self.assertEqual(index.inferred_type, 'datetime64') - - def test_date(self): - - dates = [datetime.date(2012, 1, x) for x in range(1, 20)] - index = Index(dates) - self.assertEqual(index.inferred_type, 'date') - - def test_to_object_array_tuples(self): - r = (5, 6) - values = [r] - result = lib.to_object_array_tuples(values) - - try: - # make sure record array works - from collections import namedtuple - record = namedtuple('record', 'x y') - r = record(5, 6) - values = [r] - result = lib.to_object_array_tuples(values) # noqa - except ImportError: - pass - - def test_object(self): - - # GH 7431 - # cannot infer more than this as only a single element - arr = np.array([None], dtype='O') - result = lib.infer_dtype(arr) - self.assertEqual(result, 'mixed') - - def test_categorical(self): - - # GH 8974 - from pandas import Categorical, Series - arr = Categorical(list('abc')) - result = lib.infer_dtype(arr) - self.assertEqual(result, 'categorical') - - result = lib.infer_dtype(Series(arr)) - self.assertEqual(result, 'categorical') - - arr = Categorical(list('abc'), categories=['cegfab'], ordered=True) - result = lib.infer_dtype(arr) - self.assertEqual(result, 'categorical') - - result = lib.infer_dtype(Series(arr)) - self.assertEqual(result, 'categorical') - - -class TestMoments(tm.TestCase): - pass - - -class TestReducer(tm.TestCase): - def test_int_index(self): - from pandas.core.series import Series - - arr = np.random.randn(100, 4) - result = lib.reduce(arr, np.sum, labels=Index(np.arange(4))) - expected = arr.sum(0) - assert_almost_equal(result, expected) - - result = lib.reduce(arr, np.sum, axis=1, labels=Index(np.arange(100))) - expected = arr.sum(1) - assert_almost_equal(result, expected) - - dummy = Series(0., index=np.arange(100)) - result = lib.reduce(arr, np.sum, dummy=dummy, - labels=Index(np.arange(4))) - expected = 
arr.sum(0) - assert_almost_equal(result, expected) - - dummy = Series(0., index=np.arange(4)) - result = lib.reduce(arr, np.sum, axis=1, dummy=dummy, - labels=Index(np.arange(100))) - expected = arr.sum(1) - assert_almost_equal(result, expected) - - result = lib.reduce(arr, np.sum, axis=1, dummy=dummy, - labels=Index(np.arange(100))) - assert_almost_equal(result, expected) - - -class TestTsUtil(tm.TestCase): - def test_min_valid(self): - # Ensure that Timestamp.min is a valid Timestamp - Timestamp(Timestamp.min) - - def test_max_valid(self): - # Ensure that Timestamp.max is a valid Timestamp - Timestamp(Timestamp.max) - - def test_to_datetime_bijective(self): - # Ensure that converting to datetime and back only loses precision - # by going from nanoseconds to microseconds. - self.assertEqual( - Timestamp(Timestamp.max.to_pydatetime()).value / 1000, - Timestamp.max.value / 1000) - self.assertEqual( - Timestamp(Timestamp.min.to_pydatetime()).value / 1000, - Timestamp.min.value / 1000) - - -class TestPeriodField(tm.TestCase): - def test_get_period_field_raises_on_out_of_range(self): - self.assertRaises(ValueError, period.get_period_field, -1, 0, 0) - - def test_get_period_field_array_raises_on_out_of_range(self): - self.assertRaises(ValueError, period.get_period_field_arr, -1, - np.empty(1), 0) diff --git a/pandas/tseries/tests/test_bin_groupby.py b/pandas/tseries/tests/test_bin_groupby.py new file mode 100644 index 0000000000000..6b6c468b7c391 --- /dev/null +++ b/pandas/tseries/tests/test_bin_groupby.py @@ -0,0 +1,151 @@ +# -*- coding: utf-8 -*- + +from numpy import nan +import numpy as np + +from pandas import Index, isnull +from pandas.util.testing import assert_almost_equal +import pandas.util.testing as tm +import pandas.lib as lib +import pandas.algos as algos +from pandas.core import common as com + + +def test_series_grouper(): + from pandas import Series + obj = Series(np.random.randn(10)) + dummy = obj[:0] + + labels = np.array([-1, -1, -1, 0, 0, 0, 1, 1, 1, 1], dtype=np.int64) + + grouper = lib.SeriesGrouper(obj, np.mean, labels, 2, dummy) + result, counts = grouper.get_result() + + expected = np.array([obj[3:6].mean(), obj[6:].mean()]) + assert_almost_equal(result, expected) + + exp_counts = np.array([3, 4], dtype=np.int64) + assert_almost_equal(counts, exp_counts) + + +def test_series_bin_grouper(): + from pandas import Series + obj = Series(np.random.randn(10)) + dummy = obj[:0] + + bins = np.array([3, 6]) + + grouper = lib.SeriesBinGrouper(obj, np.mean, bins, dummy) + result, counts = grouper.get_result() + + expected = np.array([obj[:3].mean(), obj[3:6].mean(), obj[6:].mean()]) + assert_almost_equal(result, expected) + + exp_counts = np.array([3, 3, 4], dtype=np.int64) + assert_almost_equal(counts, exp_counts) + + +class TestBinGroupers(tm.TestCase): + _multiprocess_can_split_ = True + + def setUp(self): + self.obj = np.random.randn(10, 1) + self.labels = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 2], dtype=np.int64) + self.bins = np.array([3, 6], dtype=np.int64) + + def test_generate_bins(self): + from pandas.core.groupby import generate_bins_generic + values = np.array([1, 2, 3, 4, 5, 6], dtype=np.int64) + binner = np.array([0, 3, 6, 9], dtype=np.int64) + + for func in [lib.generate_bins_dt64, generate_bins_generic]: + bins = func(values, binner, closed='left') + assert ((bins == np.array([2, 5, 6])).all()) + + bins = func(values, binner, closed='right') + assert ((bins == np.array([3, 6, 6])).all()) + + for func in [lib.generate_bins_dt64, generate_bins_generic]: + values = 
np.array([1, 2, 3, 4, 5, 6], dtype=np.int64) + binner = np.array([0, 3, 6], dtype=np.int64) + + bins = func(values, binner, closed='right') + assert ((bins == np.array([3, 6])).all()) + + self.assertRaises(ValueError, generate_bins_generic, values, [], + 'right') + self.assertRaises(ValueError, generate_bins_generic, values[:0], + binner, 'right') + + self.assertRaises(ValueError, generate_bins_generic, values, [4], + 'right') + self.assertRaises(ValueError, generate_bins_generic, values, [-3, -1], + 'right') + + +def test_group_ohlc(): + def _check(dtype): + obj = np.array(np.random.randn(20), dtype=dtype) + + bins = np.array([6, 12, 20]) + out = np.zeros((3, 4), dtype) + counts = np.zeros(len(out), dtype=np.int64) + labels = com._ensure_int64(np.repeat(np.arange(3), + np.diff(np.r_[0, bins]))) + + func = getattr(algos, 'group_ohlc_%s' % dtype) + func(out, counts, obj[:, None], labels) + + def _ohlc(group): + if isnull(group).all(): + return np.repeat(nan, 4) + return [group[0], group.max(), group.min(), group[-1]] + + expected = np.array([_ohlc(obj[:6]), _ohlc(obj[6:12]), + _ohlc(obj[12:])]) + + assert_almost_equal(out, expected) + tm.assert_numpy_array_equal(counts, + np.array([6, 6, 8], dtype=np.int64)) + + obj[:6] = nan + func(out, counts, obj[:, None], labels) + expected[0] = nan + assert_almost_equal(out, expected) + + _check('float32') + _check('float64') + + +class TestMoments(tm.TestCase): + pass + + +class TestReducer(tm.TestCase): + def test_int_index(self): + from pandas.core.series import Series + + arr = np.random.randn(100, 4) + result = lib.reduce(arr, np.sum, labels=Index(np.arange(4))) + expected = arr.sum(0) + assert_almost_equal(result, expected) + + result = lib.reduce(arr, np.sum, axis=1, labels=Index(np.arange(100))) + expected = arr.sum(1) + assert_almost_equal(result, expected) + + dummy = Series(0., index=np.arange(100)) + result = lib.reduce(arr, np.sum, dummy=dummy, + labels=Index(np.arange(4))) + expected = arr.sum(0) + assert_almost_equal(result, expected) + + dummy = Series(0., index=np.arange(4)) + result = lib.reduce(arr, np.sum, axis=1, dummy=dummy, + labels=Index(np.arange(100))) + expected = arr.sum(1) + assert_almost_equal(result, expected) + + result = lib.reduce(arr, np.sum, axis=1, dummy=dummy, + labels=Index(np.arange(100))) + assert_almost_equal(result, expected) diff --git a/pandas/tseries/tests/test_period.py b/pandas/tseries/tests/test_period.py index 8e6d339b87623..de23306c80b71 100644 --- a/pandas/tseries/tests/test_period.py +++ b/pandas/tseries/tests/test_period.py @@ -8,7 +8,7 @@ from datetime import datetime, date, timedelta -from pandas import Timestamp +from pandas import Timestamp, _period from pandas.tseries.frequencies import MONTHS, DAYS, _period_code_map from pandas.tseries.period import Period, PeriodIndex, period_range from pandas.tseries.index import DatetimeIndex, date_range, Index @@ -4450,6 +4450,14 @@ def test_ops_frame_period(self): tm.assert_frame_equal(df - df2, -exp) +class TestPeriodField(tm.TestCase): + def test_get_period_field_raises_on_out_of_range(self): + self.assertRaises(ValueError, _period.get_period_field, -1, 0, 0) + + def test_get_period_field_array_raises_on_out_of_range(self): + self.assertRaises(ValueError, _period.get_period_field_arr, -1, + np.empty(1), 0) + if __name__ == '__main__': import nose nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], diff --git a/pandas/tseries/tests/test_tslib.py b/pandas/tseries/tests/test_tslib.py index d7426daa794c3..c6436163b9edb 100644 --- 
a/pandas/tseries/tests/test_tslib.py +++ b/pandas/tseries/tests/test_tslib.py @@ -2,7 +2,7 @@ from distutils.version import LooseVersion import numpy as np -from pandas import tslib +from pandas import tslib, lib import pandas._period as period import datetime @@ -25,6 +25,35 @@ from pandas.util.testing import assert_series_equal, _skip_if_has_locale +class TestTsUtil(tm.TestCase): + + def test_try_parse_dates(self): + from dateutil.parser import parse + arr = np.array(['5/1/2000', '6/1/2000', '7/1/2000'], dtype=object) + + result = lib.try_parse_dates(arr, dayfirst=True) + expected = [parse(d, dayfirst=True) for d in arr] + self.assertTrue(np.array_equal(result, expected)) + + def test_min_valid(self): + # Ensure that Timestamp.min is a valid Timestamp + Timestamp(Timestamp.min) + + def test_max_valid(self): + # Ensure that Timestamp.max is a valid Timestamp + Timestamp(Timestamp.max) + + def test_to_datetime_bijective(self): + # Ensure that converting to datetime and back only loses precision + # by going from nanoseconds to microseconds. + self.assertEqual( + Timestamp(Timestamp.max.to_pydatetime()).value / 1000, + Timestamp.max.value / 1000) + self.assertEqual( + Timestamp(Timestamp.min.to_pydatetime()).value / 1000, + Timestamp.min.value / 1000) + + class TestTimestamp(tm.TestCase): def test_constructor(self): From 9e7bfdd5996d90c774cf6ebd39c6c0779469e545 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Mon, 30 May 2016 12:34:53 -0400 Subject: [PATCH 72/96] BLD: increase clone depth --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 1f2940404eed0..5a16c1a6c25e7 100644 --- a/.travis.yml +++ b/.travis.yml @@ -14,7 +14,7 @@ env: git: # for cloning - depth: 300 + depth: 500 matrix: fast_finish: true From c0850ea7f4c04a725142408208a49072dbe04b63 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Tue, 31 May 2016 09:12:12 -0400 Subject: [PATCH 73/96] ENH: add support for na_filter in Python engine Title is self-explanatory. 
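For illustration, a minimal sketch of the new behaviour (not part of the
patch itself; the CSV content and column names simply mirror the
test_na_values_na_filter_override test added below):

    import pandas as pd
    from pandas.compat import StringIO

    data = "A,B\n1,A\nnan,B\n3,C"

    # na_filter=True (the default): the Python engine now converts 'nan'
    # and any user-supplied na_values (here 'B') to NaN, like the C engine
    pd.read_csv(StringIO(data), engine='python',
                na_values=['B'], na_filter=True)

    # na_filter=False: NA detection is skipped entirely, so 'nan' and 'B'
    # are kept as literal strings (and parsing is typically faster)
    pd.read_csv(StringIO(data), engine='python',
                na_values=['B'], na_filter=False)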
Author: gfyoung Closes #13321 from gfyoung/python-engine-na-filter and squashes the following commits: 186fd34 [gfyoung] ENH: add support for na_filter in Python engine --- doc/source/whatsnew/v0.18.2.txt | 1 + pandas/io/parsers.py | 12 +++++++++--- pandas/io/tests/parser/c_parser_only.py | 6 ------ pandas/io/tests/parser/common.py | 6 ++---- pandas/io/tests/parser/na_values.py | 18 ++++++++++++++++++ pandas/io/tests/parser/parse_dates.py | 7 +++++++ 6 files changed, 37 insertions(+), 13 deletions(-) diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index 2b67aca1dcf74..be38adb96e403 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -75,6 +75,7 @@ Other enhancements pd.Timestamp(year=2012, month=1, day=1, hour=8, minute=30) - The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``decimal`` option (:issue:`12933`) +- The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``na_filter`` option (:issue:`13321`) - ``Index.astype()`` now accepts an optional boolean argument ``copy``, which allows optional copying if the requirements on dtype are satisfied (:issue:`13209`) - ``Index`` now supports the ``.where()`` function for same shape indexing (:issue:`13170`) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index bf4083f61155c..394fe1a98880a 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -425,7 +425,6 @@ def _read(filepath_or_buffer, kwds): _c_unsupported = set(['skip_footer']) _python_unsupported = set([ 'as_recarray', - 'na_filter', 'compact_ints', 'use_unsigned', 'low_memory', @@ -1188,8 +1187,13 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False, result = {} for c, values in compat.iteritems(dct): conv_f = None if converters is None else converters.get(c, None) - col_na_values, col_na_fvalues = _get_na_values(c, na_values, - na_fvalues) + + if self.na_filter: + col_na_values, col_na_fvalues = _get_na_values( + c, na_values, na_fvalues) + else: + col_na_values, col_na_fvalues = set(), set() + coerce_type = True if conv_f is not None: try: @@ -1634,6 +1638,8 @@ def __init__(self, f, **kwds): self.names_passed = kwds['names'] or None + self.na_filter = kwds['na_filter'] + self.has_index_names = False if 'has_index_names' in kwds: self.has_index_names = kwds['has_index_names'] diff --git a/pandas/io/tests/parser/c_parser_only.py b/pandas/io/tests/parser/c_parser_only.py index 9dde669c9d39d..00c4e0a1c022b 100644 --- a/pandas/io/tests/parser/c_parser_only.py +++ b/pandas/io/tests/parser/c_parser_only.py @@ -61,12 +61,6 @@ def test_delim_whitespace_custom_terminator(self): columns=['a', 'b', 'c']) tm.assert_frame_equal(df, expected) - def test_parse_dates_empty_string(self): - # see gh-2263 - s = StringIO("Date, test\n2012-01-01, 1\n,2") - result = self.read_csv(s, parse_dates=["Date"], na_filter=False) - self.assertTrue(result['Date'].isnull()[1]) - def test_dtype_and_names_error(self): # see gh-8833: passing both dtype and names # resulting in an error reporting issue diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py index 2e3c102948cfa..44892dc17c47b 100644 --- a/pandas/io/tests/parser/common.py +++ b/pandas/io/tests/parser/common.py @@ -1319,10 +1319,8 @@ def test_inf_parsing(self): df = self.read_csv(StringIO(data), index_col=0) tm.assert_almost_equal(df['A'].values, expected.values) - if self.engine == 'c': - # TODO: remove condition when 'na_filter' is supported for Python - df = 
self.read_csv(StringIO(data), index_col=0, na_filter=False) - tm.assert_almost_equal(df['A'].values, expected.values) + df = self.read_csv(StringIO(data), index_col=0, na_filter=False) + tm.assert_almost_equal(df['A'].values, expected.values) def test_raise_on_no_columns(self): # single newline diff --git a/pandas/io/tests/parser/na_values.py b/pandas/io/tests/parser/na_values.py index 4705fd08af2b4..d826ae536c6cc 100644 --- a/pandas/io/tests/parser/na_values.py +++ b/pandas/io/tests/parser/na_values.py @@ -223,3 +223,21 @@ def test_na_values_keep_default(self): 'Three': ['None', 'two', 'None', 'nan', 'five', '', 'seven']}) tm.assert_frame_equal(xp.reindex(columns=df.columns), df) + + def test_na_values_na_filter_override(self): + data = """\ +A,B +1,A +nan,B +3,C +""" + + expected = DataFrame([[1, 'A'], [np.nan, np.nan], [3, 'C']], + columns=['A', 'B']) + out = self.read_csv(StringIO(data), na_values=['B'], na_filter=True) + tm.assert_frame_equal(out, expected) + + expected = DataFrame([['1', 'A'], ['nan', 'B'], ['3', 'C']], + columns=['A', 'B']) + out = self.read_csv(StringIO(data), na_values=['B'], na_filter=False) + tm.assert_frame_equal(out, expected) diff --git a/pandas/io/tests/parser/parse_dates.py b/pandas/io/tests/parser/parse_dates.py index ec368bb358ad5..01816bde66120 100644 --- a/pandas/io/tests/parser/parse_dates.py +++ b/pandas/io/tests/parser/parse_dates.py @@ -467,3 +467,10 @@ def test_read_with_parse_dates_invalid_type(self): StringIO(data), parse_dates=np.array([4, 5])) tm.assertRaisesRegexp(TypeError, errmsg, self.read_csv, StringIO(data), parse_dates=set([1, 3, 3])) + + def test_parse_dates_empty_string(self): + # see gh-2263 + data = "Date, test\n2012-01-01, 1\n,2" + result = self.read_csv(StringIO(data), parse_dates=["Date"], + na_filter=False) + self.assertTrue(result['Date'].isnull()[1]) From 352ae44398eae5cdfd5de9700939303cc1ccb1bd Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 31 May 2016 09:53:15 -0400 Subject: [PATCH 74/96] TST: more strict testing in lint.sh Author: Jeff Reback Closes #13334 from jreback/testing and squashes the following commits: 7653af9 [Jeff Reback] TST: more strict testing in lint.sh --- ci/lint.sh | 2 +- pandas/io/tests/json/test_pandas.py | 2 +- pandas/io/tests/test_packers.py | 6 ++-- pandas/src/testing.pyx | 20 +++++++---- pandas/tests/test_algos.py | 10 +++--- pandas/tests/test_nanops.py | 34 ++++++++++--------- pandas/tests/test_panel.py | 4 +-- pandas/tests/test_testing.py | 6 ++++ pandas/util/testing.py | 51 ++++++++++++++++++++++------- 9 files changed, 90 insertions(+), 45 deletions(-) diff --git a/ci/lint.sh b/ci/lint.sh index eb4c655e8bd3e..a4c960084040f 100755 --- a/ci/lint.sh +++ b/ci/lint.sh @@ -20,7 +20,7 @@ if [ "$LINT" ]; then echo "Linting DONE" echo "Check for invalid testing" - grep -r --include '*.py' --exclude nosetester.py --exclude testing.py 'numpy.testing' pandas + grep -r -E --include '*.py' --exclude nosetester.py --exclude testing.py '(numpy|np)\.testing' pandas if [ $? 
= "0" ]; then RET=1 fi diff --git a/pandas/io/tests/json/test_pandas.py b/pandas/io/tests/json/test_pandas.py index 43b8d6b9563f1..9f8aedc2e399e 100644 --- a/pandas/io/tests/json/test_pandas.py +++ b/pandas/io/tests/json/test_pandas.py @@ -100,7 +100,7 @@ def test_frame_non_unique_index(self): orient='split')) unser = read_json(df.to_json(orient='records'), orient='records') self.assert_index_equal(df.columns, unser.columns) - np.testing.assert_equal(df.values, unser.values) + tm.assert_almost_equal(df.values, unser.values) unser = read_json(df.to_json(orient='values'), orient='values') tm.assert_numpy_array_equal(df.values, unser.values) diff --git a/pandas/io/tests/test_packers.py b/pandas/io/tests/test_packers.py index b647ec6b25717..ad7d6c3c9f94f 100644 --- a/pandas/io/tests/test_packers.py +++ b/pandas/io/tests/test_packers.py @@ -671,14 +671,14 @@ def _test_small_strings_no_warn(self, compress): with tm.assert_produces_warning(None): empty_unpacked = self.encode_decode(empty, compress=compress) - np.testing.assert_array_equal(empty_unpacked, empty) + tm.assert_numpy_array_equal(empty_unpacked, empty) self.assertTrue(empty_unpacked.flags.writeable) char = np.array([ord(b'a')], dtype='uint8') with tm.assert_produces_warning(None): char_unpacked = self.encode_decode(char, compress=compress) - np.testing.assert_array_equal(char_unpacked, char) + tm.assert_numpy_array_equal(char_unpacked, char) self.assertTrue(char_unpacked.flags.writeable) # if this test fails I am sorry because the interpreter is now in a # bad state where b'a' points to 98 == ord(b'b'). @@ -688,7 +688,7 @@ def _test_small_strings_no_warn(self, compress): # always be the same (unless we were able to mutate the shared # character singleton in which case ord(b'a') == ord(b'b'). self.assertEqual(ord(b'a'), ord(u'a')) - np.testing.assert_array_equal( + tm.assert_numpy_array_equal( char_unpacked, np.array([ord(b'b')], dtype='uint8'), ) diff --git a/pandas/src/testing.pyx b/pandas/src/testing.pyx index 9f102ded597fd..6780cf311c244 100644 --- a/pandas/src/testing.pyx +++ b/pandas/src/testing.pyx @@ -55,7 +55,9 @@ cpdef assert_dict_equal(a, b, bint compare_keys=True): return True -cpdef assert_almost_equal(a, b, bint check_less_precise=False, check_dtype=True, +cpdef assert_almost_equal(a, b, + check_less_precise=False, + bint check_dtype=True, obj=None, lobj=None, robj=None): """Check that left and right objects are almost equal. @@ -63,9 +65,10 @@ cpdef assert_almost_equal(a, b, bint check_less_precise=False, check_dtype=True, ---------- a : object b : object - check_less_precise : bool, default False + check_less_precise : bool or int, default False Specify comparison precision. 5 digits (False) or 3 digits (True) after decimal points are compared. 
+ If an integer, then this will be the number of decimal points to compare check_dtype: bool, default True check dtype if both a and b are np.ndarray obj : str, default None @@ -91,6 +94,8 @@ cpdef assert_almost_equal(a, b, bint check_less_precise=False, check_dtype=True, if robj is None: robj = b + assert isinstance(check_less_precise, (int, bool)) + if isinstance(a, dict) or isinstance(b, dict): return assert_dict_equal(a, b) @@ -145,7 +150,7 @@ cpdef assert_almost_equal(a, b, bint check_less_precise=False, check_dtype=True, for i in xrange(len(a)): try: - assert_almost_equal(a[i], b[i], check_less_precise) + assert_almost_equal(a[i], b[i], check_less_precise=check_less_precise) except AssertionError: is_unequal = True diff += 1 @@ -173,11 +178,12 @@ cpdef assert_almost_equal(a, b, bint check_less_precise=False, check_dtype=True, # inf comparison return True - decimal = 5 - - # deal with differing dtypes - if check_less_precise: + if check_less_precise is True: decimal = 3 + elif check_less_precise is False: + decimal = 5 + else: + decimal = check_less_precise fa, fb = a, b diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index be8468d426946..8af93ad0ecb2e 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -585,7 +585,7 @@ def test_group_var_generic_1d(self): expected_counts = counts + 3 self.algo(out, counts, values, labels) - np.testing.assert_allclose(out, expected_out, self.rtol) + self.assertTrue(np.allclose(out, expected_out, self.rtol)) tm.assert_numpy_array_equal(counts, expected_counts) def test_group_var_generic_1d_flat_labels(self): @@ -601,7 +601,7 @@ def test_group_var_generic_1d_flat_labels(self): self.algo(out, counts, values, labels) - np.testing.assert_allclose(out, expected_out, self.rtol) + self.assertTrue(np.allclose(out, expected_out, self.rtol)) tm.assert_numpy_array_equal(counts, expected_counts) def test_group_var_generic_2d_all_finite(self): @@ -616,7 +616,7 @@ def test_group_var_generic_2d_all_finite(self): expected_counts = counts + 2 self.algo(out, counts, values, labels) - np.testing.assert_allclose(out, expected_out, self.rtol) + self.assertTrue(np.allclose(out, expected_out, self.rtol)) tm.assert_numpy_array_equal(counts, expected_counts) def test_group_var_generic_2d_some_nan(self): @@ -631,11 +631,11 @@ def test_group_var_generic_2d_some_nan(self): expected_out = np.vstack([values[:, 0] .reshape(5, 2, order='F') .std(ddof=1, axis=1) ** 2, - np.nan * np.ones(5)]).T + np.nan * np.ones(5)]).T.astype(self.dtype) expected_counts = counts + 2 self.algo(out, counts, values, labels) - np.testing.assert_allclose(out, expected_out, self.rtol) + tm.assert_almost_equal(out, expected_out, check_less_precise=6) tm.assert_numpy_array_equal(counts, expected_counts) def test_group_var_constant(self): diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py index e244a04127949..904bedde03312 100644 --- a/pandas/tests/test_nanops.py +++ b/pandas/tests/test_nanops.py @@ -799,30 +799,31 @@ def setUp(self): def test_nanvar_all_finite(self): samples = self.samples actual_variance = nanops.nanvar(samples) - np.testing.assert_almost_equal(actual_variance, self.variance, - decimal=2) + tm.assert_almost_equal(actual_variance, self.variance, + check_less_precise=2) def test_nanvar_nans(self): samples = np.nan * np.ones(2 * self.samples.shape[0]) samples[::2] = self.samples actual_variance = nanops.nanvar(samples, skipna=True) - np.testing.assert_almost_equal(actual_variance, self.variance, - decimal=2) + 
tm.assert_almost_equal(actual_variance, self.variance, + check_less_precise=2) actual_variance = nanops.nanvar(samples, skipna=False) - np.testing.assert_almost_equal(actual_variance, np.nan, decimal=2) + tm.assert_almost_equal(actual_variance, np.nan, check_less_precise=2) def test_nanstd_nans(self): samples = np.nan * np.ones(2 * self.samples.shape[0]) samples[::2] = self.samples actual_std = nanops.nanstd(samples, skipna=True) - np.testing.assert_almost_equal(actual_std, self.variance ** 0.5, - decimal=2) + tm.assert_almost_equal(actual_std, self.variance ** 0.5, + check_less_precise=2) actual_std = nanops.nanvar(samples, skipna=False) - np.testing.assert_almost_equal(actual_std, np.nan, decimal=2) + tm.assert_almost_equal(actual_std, np.nan, + check_less_precise=2) def test_nanvar_axis(self): # Generate some sample data. @@ -831,8 +832,8 @@ def test_nanvar_axis(self): samples = np.vstack([samples_norm, samples_unif]) actual_variance = nanops.nanvar(samples, axis=1) - np.testing.assert_array_almost_equal(actual_variance, np.array( - [self.variance, 1.0 / 12]), decimal=2) + tm.assert_almost_equal(actual_variance, np.array( + [self.variance, 1.0 / 12]), check_less_precise=2) def test_nanvar_ddof(self): n = 5 @@ -845,13 +846,16 @@ def test_nanvar_ddof(self): # The unbiased estimate. var = 1.0 / 12 - np.testing.assert_almost_equal(variance_1, var, decimal=2) + tm.assert_almost_equal(variance_1, var, + check_less_precise=2) + # The underestimated variance. - np.testing.assert_almost_equal(variance_0, (n - 1.0) / n * var, - decimal=2) + tm.assert_almost_equal(variance_0, (n - 1.0) / n * var, + check_less_precise=2) + # The overestimated variance. - np.testing.assert_almost_equal(variance_2, (n - 1.0) / (n - 2.0) * var, - decimal=2) + tm.assert_almost_equal(variance_2, (n - 1.0) / (n - 2.0) * var, + check_less_precise=2) def test_ground_truth(self): # Test against values that were precomputed with Numpy. 
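A short, illustrative sketch of the looser comparison used above (not part
of the diff; the values are taken from the updated ``test_less_precise``).
``check_less_precise`` now accepts an int giving the number of decimal
places to compare, in addition to ``False`` (5 digits) and ``True`` (3
digits):

    import pandas as pd
    import pandas.util.testing as tm

    s1 = pd.Series([0.12345], dtype='float32')
    s2 = pd.Series([0.12346], dtype='float32')

    # the default 5-digit comparison raises AssertionError for these values
    # tm.assert_series_equal(s1, s2)

    # an integer check_less_precise selects the decimal places to compare,
    # so a 3-digit comparison passes
    tm.assert_series_equal(s1, s2, check_less_precise=3)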
diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py
index 7792a1f5d3509..b1f09ad2685e3 100644
--- a/pandas/tests/test_panel.py
+++ b/pandas/tests/test_panel.py
@@ -2301,8 +2301,8 @@ def test_update_raise(self):
                            [[1.5, np.nan, 3.], [1.5, np.nan, 3.],
                             [1.5, np.nan, 3.], [1.5, np.nan, 3.]]])
 
-        np.testing.assert_raises(Exception, pan.update, *(pan, ),
-                                 **{'raise_conflict': True})
+        self.assertRaises(Exception, pan.update, *(pan, ),
+                          **{'raise_conflict': True})
 
     def test_all_any(self):
         self.assertTrue((self.panel.all(axis=0).values == nanall(
diff --git a/pandas/tests/test_testing.py b/pandas/tests/test_testing.py
index 9cc76591e9b7b..c4e864a909c03 100644
--- a/pandas/tests/test_testing.py
+++ b/pandas/tests/test_testing.py
@@ -519,12 +519,18 @@ def test_less_precise(self):
         self.assertRaises(AssertionError, assert_series_equal, s1, s2)
         self._assert_equal(s1, s2, check_less_precise=True)
 
+        for i in range(4):
+            self._assert_equal(s1, s2, check_less_precise=i)
+
         self.assertRaises(AssertionError, assert_series_equal, s1, s2, 10)
 
         s1 = Series([0.12345], dtype='float32')
         s2 = Series([0.12346], dtype='float32')
         self.assertRaises(AssertionError, assert_series_equal, s1, s2)
         self._assert_equal(s1, s2, check_less_precise=True)
+        for i in range(4):
+            self._assert_equal(s1, s2, check_less_precise=i)
+
         self.assertRaises(AssertionError, assert_series_equal, s1, s2, 10)
 
         # even less than less precise
         s1 = Series([0.1235], dtype='float32')
diff --git a/pandas/util/testing.py b/pandas/util/testing.py
index f2b5bf7d2739d..ef94692ea9673 100644
--- a/pandas/util/testing.py
+++ b/pandas/util/testing.py
@@ -116,18 +116,40 @@ def assertNotAlmostEquals(self, *args, **kwargs):
 
 
 def assert_almost_equal(left, right, check_exact=False,
-                        check_dtype='equiv', **kwargs):
+                        check_dtype='equiv', check_less_precise=False,
+                        **kwargs):
+    """Check that the left and right objects are approximately equal.
+
+    Parameters
+    ----------
+    left : object
+    right : object
+    check_exact : bool, default False
+        Whether to compare numbers exactly.
+    check_dtype: bool, default True
+        check dtype if both a and b are the same type
+    check_less_precise : bool or int, default False
+        Specify comparison precision. Only used when check_exact is False.
+        5 digits (False) or 3 digits (True) after decimal points are compared.
+        If int, then specify the digits to compare
+    """
     if isinstance(left, pd.Index):
         return assert_index_equal(left, right, check_exact=check_exact,
-                                  exact=check_dtype, **kwargs)
+                                  exact=check_dtype,
+                                  check_less_precise=check_less_precise,
+                                  **kwargs)
 
     elif isinstance(left, pd.Series):
         return assert_series_equal(left, right, check_exact=check_exact,
-                                   check_dtype=check_dtype, **kwargs)
+                                   check_dtype=check_dtype,
+                                   check_less_precise=check_less_precise,
+                                   **kwargs)
 
     elif isinstance(left, pd.DataFrame):
         return assert_frame_equal(left, right, check_exact=check_exact,
-                                  check_dtype=check_dtype, **kwargs)
+                                  check_dtype=check_dtype,
+                                  check_less_precise=check_less_precise,
+                                  **kwargs)
 
     else:
 
         # other sequences
@@ -142,8 +164,11 @@ def assert_almost_equal(left, right, check_exact=False,
         else:
             obj = 'Input'
         assert_class_equal(left, right, obj=obj)
-        return _testing.assert_almost_equal(left, right,
-                                            check_dtype=check_dtype, **kwargs)
+        return _testing.assert_almost_equal(
+            left, right,
+            check_dtype=check_dtype,
+            check_less_precise=check_less_precise,
+            **kwargs)
 
 
 def assert_dict_equal(left, right, compare_keys=True):
@@ -690,9 +715,10 @@ def assert_index_equal(left, right, exact='equiv', check_names=True,
         Int64Index as well
     check_names : bool, default True
         Whether to check the names attribute.
-    check_less_precise : bool, default False
+    check_less_precise : bool or int, default False
         Specify comparison precision. Only used when check_exact is False.
         5 digits (False) or 3 digits (True) after decimal points are compared.
+        If int, then specify the digits to compare
     check_exact : bool, default True
         Whether to compare number exactly.
     check_categorical : bool, default True
@@ -1040,9 +1066,10 @@ def assert_series_equal(left, right, check_dtype=True,
         are identical.
     check_series_type : bool, default False
         Whether to check the Series class is identical.
-    check_less_precise : bool, default False
+    check_less_precise : bool or int, default False
         Specify comparison precision. Only used when check_exact is False.
         5 digits (False) or 3 digits (True) after decimal points are compared.
+        If int, then specify the digits to compare
     check_exact : bool, default False
         Whether to compare number exactly.
     check_names : bool, default True
@@ -1106,7 +1133,7 @@ def assert_series_equal(left, right, check_dtype=True,
                                   check_dtype=check_dtype)
     else:
         _testing.assert_almost_equal(left.get_values(), right.get_values(),
-                                     check_less_precise,
+                                     check_less_precise=check_less_precise,
                                      check_dtype=check_dtype,
                                      obj='{0}'.format(obj))
 
@@ -1150,9 +1177,10 @@ def assert_frame_equal(left, right, check_dtype=True,
         are identical.
     check_frame_type : bool, default False
         Whether to check the DataFrame class is identical.
-    check_less_precise : bool, default False
+    check_less_precise : bool or int, default False
         Specify comparison precision. Only used when check_exact is False.
         5 digits (False) or 3 digits (True) after decimal points are compared.
+        If int, then specify the digits to compare
     check_names : bool, default True
         Whether to check the Index names attribute.
     by_blocks : bool, default False
@@ -1259,9 +1287,10 @@ def assert_panelnd_equal(left, right,
         Whether to check the Panel dtype is identical.
     check_panel_type : bool, default False
         Whether to check the Panel class is identical.
-    check_less_precise : bool, default False
+    check_less_precise : bool or int, default False
         Specify comparison precision. Only used when check_exact is False.
         5 digits (False) or 3 digits (True) after decimal points are compared.
+ If int, then specify the digits to compare assert_func : function for comparing data check_names : bool, default True Whether to check the Index names attribute. From 132c1c55c3d7884e149dd8f99655f1d2c720696c Mon Sep 17 00:00:00 2001 From: Piotr Jucha Date: Tue, 31 May 2016 10:02:12 -0400 Subject: [PATCH 75/96] BUG: Fix describe(): percentiles (#13104), col index (#13288) closes #13104 closes #13288 Author: Piotr Jucha Closes #13298 from pijucha/bug13104 and squashes the following commits: 9a6bd6e [Piotr Jucha] BUG: Fix describe(): percentiles (#13104), col index (#13288) --- doc/source/whatsnew/v0.18.2.txt | 49 ++++++++++++++++++++++ pandas/core/generic.py | 43 +++++++++---------- pandas/formats/format.py | 64 ++++++++++++++++++++++++++++- pandas/tests/formats/test_format.py | 15 +++++++ pandas/tests/test_generic.py | 63 ++++++++++++++++++++++++++++ 5 files changed, 212 insertions(+), 22 deletions(-) diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index be38adb96e403..b557861c1a375 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -229,6 +229,55 @@ resulting dtype will be upcast (unchanged from previous). pd.merge(df1, df2, how='outer', on='key') pd.merge(df1, df2, how='outer', on='key').dtypes +.. _whatsnew_0182.describe: + +``.describe()`` changes +^^^^^^^^^^^^^^^^^^^^^^^ + +Percentile identifiers in the index of a ``.describe()`` output will now be rounded to the least precision that keeps them distinct (:issue:`13104`) + +.. ipython:: python + + s = pd.Series([0, 1, 2, 3, 4]) + df = pd.DataFrame([0, 1, 2, 3, 4]) + +Previous Behavior: + +The percentiles were rounded to at most one decimal place, which could raise ``ValueError`` for a data frame if the percentiles were duplicated. + +.. code-block:: ipython + + In [3]: s.describe(percentiles=[0.0001, 0.0005, 0.001, 0.999, 0.9995, 0.9999]) + Out[3]: + count 5.000000 + mean 2.000000 + std 1.581139 + min 0.000000 + 0.0% 0.000400 + 0.1% 0.002000 + 0.1% 0.004000 + 50% 2.000000 + 99.9% 3.996000 + 100.0% 3.998000 + 100.0% 3.999600 + max 4.000000 + dtype: float64 + + In [4]: df.describe(percentiles=[0.0001, 0.0005, 0.001, 0.999, 0.9995, 0.9999]) + Out[4]: + ... + ValueError: cannot reindex from a duplicate axis + +New Behavior: + +.. ipython:: python + + s.describe(percentiles=[0.0001, 0.0005, 0.001, 0.999, 0.9995, 0.9999]) + df.describe(percentiles=[0.0001, 0.0005, 0.001, 0.999, 0.9995, 0.9999]) + +- Passing duplicated ``percentiles`` will now raise a ``ValueError``. +- Bug in ``.describe()`` on a DataFrame with a mixed-dtype column index, which would previously raise a ``TypeError`` (:issue:`13288`) + .. _whatsnew_0182.api.other: Other API changes diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 99599d2b04a45..9ecaaebc2b523 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -20,6 +20,7 @@ import pandas.core.missing as missing import pandas.core.datetools as datetools from pandas.formats.printing import pprint_thing +from pandas.formats.format import format_percentiles from pandas import compat from pandas.compat.numpy import function as nv from pandas.compat import (map, zip, lrange, string_types, @@ -4868,32 +4869,33 @@ def abs(self): @Appender(_shared_docs['describe'] % _shared_doc_kwargs) def describe(self, percentiles=None, include=None, exclude=None): if self.ndim >= 3: - msg = "describe is not implemented on on Panel or PanelND objects." + msg = "describe is not implemented on Panel or PanelND objects." 
raise NotImplementedError(msg) + elif self.ndim == 2 and self.columns.size == 0: + raise ValueError("Cannot describe a DataFrame without columns") if percentiles is not None: # get them all to be in [0, 1] self._check_percentile(percentiles) + + # median should always be included + if 0.5 not in percentiles: + percentiles.append(0.5) percentiles = np.asarray(percentiles) else: percentiles = np.array([0.25, 0.5, 0.75]) - # median should always be included - if (percentiles != 0.5).all(): # median isn't included - lh = percentiles[percentiles < .5] - uh = percentiles[percentiles > .5] - percentiles = np.hstack([lh, 0.5, uh]) + # sort and check for duplicates + unique_pcts = np.unique(percentiles) + if len(unique_pcts) < len(percentiles): + raise ValueError("percentiles cannot contain duplicates") + percentiles = unique_pcts - def pretty_name(x): - x *= 100 - if x == int(x): - return '%.0f%%' % x - else: - return '%.1f%%' % x + formatted_percentiles = format_percentiles(percentiles) - def describe_numeric_1d(series, percentiles): + def describe_numeric_1d(series): stat_index = (['count', 'mean', 'std', 'min'] + - [pretty_name(x) for x in percentiles] + ['max']) + formatted_percentiles + ['max']) d = ([series.count(), series.mean(), series.std(), series.min()] + [series.quantile(x) for x in percentiles] + [series.max()]) return pd.Series(d, index=stat_index, name=series.name) @@ -4918,18 +4920,18 @@ def describe_categorical_1d(data): return pd.Series(result, index=names, name=data.name) - def describe_1d(data, percentiles): + def describe_1d(data): if com.is_bool_dtype(data): return describe_categorical_1d(data) elif com.is_numeric_dtype(data): - return describe_numeric_1d(data, percentiles) + return describe_numeric_1d(data) elif com.is_timedelta64_dtype(data): - return describe_numeric_1d(data, percentiles) + return describe_numeric_1d(data) else: return describe_categorical_1d(data) if self.ndim == 1: - return describe_1d(self, percentiles) + return describe_1d(self) elif (include is None) and (exclude is None): if len(self._get_numeric_data()._info_axis) > 0: # when some numerics are found, keep only numerics @@ -4944,7 +4946,7 @@ def describe_1d(data, percentiles): else: data = self.select_dtypes(include=include, exclude=exclude) - ldesc = [describe_1d(s, percentiles) for _, s in data.iteritems()] + ldesc = [describe_1d(s) for _, s in data.iteritems()] # set a convenient order for rows names = [] ldesc_indexes = sorted([x.index for x in ldesc], key=len) @@ -4954,8 +4956,7 @@ def describe_1d(data, percentiles): names.append(name) d = pd.concat(ldesc, join_axes=pd.Index([names]), axis=1) - d.columns = self.columns._shallow_copy(values=d.columns.values) - d.columns.names = data.columns.names + d.columns = data.columns.copy() return d def _check_percentile(self, q): diff --git a/pandas/formats/format.py b/pandas/formats/format.py index 70b506a1415c1..27d8b553013b9 100644 --- a/pandas/formats/format.py +++ b/pandas/formats/format.py @@ -6,7 +6,7 @@ import sys from pandas.core.base import PandasObject -from pandas.core.common import isnull, notnull +from pandas.core.common import isnull, notnull, is_numeric_dtype from pandas.core.index import Index, MultiIndex, _ensure_index from pandas import compat from pandas.compat import (StringIO, lzip, range, map, zip, reduce, u, @@ -2260,6 +2260,68 @@ def _format_strings(self): return fmt_values +def format_percentiles(percentiles): + """ + Outputs rounded and formatted percentiles. 
+ + Parameters + ---------- + percentiles : list-like, containing floats from interval [0,1] + + Returns + ------- + formatted : list of strings + + Notes + ----- + Rounding precision is chosen so that: (1) if any two elements of + ``percentiles`` differ, they remain different after rounding + (2) no entry is *rounded* to 0% or 100%. + Any non-integer is always rounded to at least 1 decimal place. + + Examples + -------- + Keeps all entries different after rounding: + + >>> format_percentiles([0.01999, 0.02001, 0.5, 0.666666, 0.9999]) + ['1.999%', '2.001%', '50%', '66.667%', '99.99%'] + + No element is rounded to 0% or 100% (unless already equal to it). + Duplicates are allowed: + + >>> format_percentiles([0, 0.5, 0.02001, 0.5, 0.666666, 0.9999]) + ['0%', '50%', '2.0%', '50%', '66.67%', '99.99%'] + """ + + percentiles = np.asarray(percentiles) + + # It checks for np.NaN as well + if not is_numeric_dtype(percentiles) or not np.all(percentiles >= 0) \ + or not np.all(percentiles <= 1): + raise ValueError("percentiles should all be in the interval [0,1]") + + percentiles = 100 * percentiles + int_idx = (percentiles.astype(int) == percentiles) + + if np.all(int_idx): + out = percentiles.astype(int).astype(str) + return [i + '%' for i in out] + + unique_pcts = np.unique(percentiles) + to_begin = unique_pcts[0] if unique_pcts[0] > 0 else None + to_end = 100 - unique_pcts[-1] if unique_pcts[-1] < 100 else None + + # Least precision that keeps percentiles unique after rounding + prec = -np.floor(np.log10(np.min( + np.ediff1d(unique_pcts, to_begin=to_begin, to_end=to_end) + ))).astype(int) + prec = max(1, prec) + out = np.empty_like(percentiles, dtype=object) + out[int_idx] = percentiles[int_idx].astype(int).astype(str) + out[~int_idx] = percentiles[~int_idx].round(prec).astype(str) + return [i + '%' for i in out] + + def _is_dates_only(values): # return a boolean if we are only dates (and don't have a timezone) values = DatetimeIndex(values) diff --git a/pandas/tests/formats/test_format.py b/pandas/tests/formats/test_format.py index 7a806280916f1..e67fe2cddde77 100644 --- a/pandas/tests/formats/test_format.py +++ b/pandas/tests/formats/test_format.py @@ -4264,6 +4264,21 @@ def test_nat_representations(self): self.assertEqual(f(pd.NaT), 'NaT') +def test_format_percentiles(): + result = fmt.format_percentiles([0.01999, 0.02001, 0.5, 0.666666, 0.9999]) + expected = ['1.999%', '2.001%', '50%', '66.667%', '99.99%'] + tm.assert_equal(result, expected) + + result = fmt.format_percentiles([0, 0.5, 0.02001, 0.5, 0.666666, 0.9999]) + expected = ['0%', '50%', '2.0%', '50%', '66.67%', '99.99%'] + tm.assert_equal(result, expected) + + tm.assertRaises(ValueError, fmt.format_percentiles, [0.1, np.nan, 0.5]) + tm.assertRaises(ValueError, fmt.format_percentiles, [-0.001, 0.1, 0.5]) + tm.assertRaises(ValueError, fmt.format_percentiles, [2, 0.1, 0.5]) + tm.assertRaises(ValueError, fmt.format_percentiles, [0.1, 0.5, 'a']) + + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py index 83e1a17fc8b0c..2f4c2b414cc30 100644 --- a/pandas/tests/test_generic.py +++ b/pandas/tests/test_generic.py @@ -996,6 +996,59 @@ def test_describe_percentiles_insert_median(self): self.assertTrue('0%' in d1.index) self.assertTrue('100%' in d2.index) + def test_describe_percentiles_unique(self): + # GH13104 + df = tm.makeDataFrame() + with self.assertRaises(ValueError): + df.describe(percentiles=[0.1, 0.2, 0.4, 
0.5, 0.2, 0.6]) + with self.assertRaises(ValueError): + df.describe(percentiles=[0.1, 0.2, 0.4, 0.2, 0.6]) + + def test_describe_percentiles_formatting(self): + # GH13104 + df = tm.makeDataFrame() + + # default + result = df.describe().index + expected = Index(['count', 'mean', 'std', 'min', '25%', '50%', '75%', + 'max'], + dtype='object') + tm.assert_index_equal(result, expected) + + result = df.describe(percentiles=[0.0001, 0.0005, 0.001, 0.999, + 0.9995, 0.9999]).index + expected = Index(['count', 'mean', 'std', 'min', '0.01%', '0.05%', + '0.1%', '50%', '99.9%', '99.95%', '99.99%', 'max'], + dtype='object') + tm.assert_index_equal(result, expected) + + result = df.describe(percentiles=[0.00499, 0.005, 0.25, 0.50, + 0.75]).index + expected = Index(['count', 'mean', 'std', 'min', '0.499%', '0.5%', + '25%', '50%', '75%', 'max'], + dtype='object') + tm.assert_index_equal(result, expected) + + result = df.describe(percentiles=[0.00499, 0.01001, 0.25, 0.50, + 0.75]).index + expected = Index(['count', 'mean', 'std', 'min', '0.5%', '1.0%', + '25%', '50%', '75%', 'max'], + dtype='object') + tm.assert_index_equal(result, expected) + + def test_describe_column_index_type(self): + # GH13288 + df = pd.DataFrame([1, 2, 3, 4]) + df.columns = pd.Index([0], dtype=object) + result = df.describe().columns + expected = Index([0], dtype=object) + tm.assert_index_equal(result, expected) + + df = pd.DataFrame({'A': list("BCDE"), 0: [1, 2, 3, 4]}) + result = df.describe().columns + expected = Index([0], dtype=object) + tm.assert_index_equal(result, expected) + def test_describe_no_numeric(self): df = DataFrame({'A': ['foo', 'foo', 'bar'] * 8, 'B': ['a', 'b', 'c', 'd'] * 6}) @@ -1010,6 +1063,16 @@ def test_describe_no_numeric(self): desc = df.describe() self.assertEqual(desc.time['first'], min(ts.index)) + def test_describe_empty(self): + df = DataFrame() + tm.assertRaisesRegexp(ValueError, 'DataFrame without columns', + df.describe) + + df = DataFrame(columns=['A', 'B']) + result = df.describe() + expected = DataFrame(0, columns=['A', 'B'], index=['count', 'unique']) + tm.assert_frame_equal(result, expected) + def test_describe_empty_int_columns(self): df = DataFrame([[0, 1], [1, 2]]) desc = df[df[0] < 0].describe() # works From d1916404bc604ebb319710740b5dcc7eabd4fc89 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Tue, 31 May 2016 10:30:23 -0400 Subject: [PATCH 76/96] ENH: Respect Key Ordering for OrderedDict List in DataFrame Init Title is self-explanatory. Closes #13304. 
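A minimal sketch of the new behavior, distilled from the tests added in this
patch; it assumes only ``pandas`` and the standard library:

.. code-block:: python

    from collections import OrderedDict

    import pandas as pd

    # keys deliberately given in non-alphabetical order
    row = OrderedDict([('b', 2), ('a', 1)])

    df = pd.DataFrame([row])
    print(df.columns.tolist())  # ['b', 'a'] -- insertion order is kept
    # previously the labels were sorted, yielding ['a', 'b']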
Author: gfyoung Closes #13309 from gfyoung/ordereddict-key-ordering-init and squashes the following commits: 4f311cc [gfyoung] ENH: Respect key ordering for OrderedDict list in DataFrame init --- doc/source/whatsnew/v0.18.2.txt | 1 + pandas/core/frame.py | 3 +- pandas/lib.pyx | 26 ++++++++++---- pandas/tests/frame/test_constructors.py | 45 +++++++++++++++++++++++++ pandas/tests/test_lib.py | 13 +++++++ 5 files changed, 81 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index b557861c1a375..984001db5783c 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -88,6 +88,7 @@ Other enhancements - ``Categorical.astype()`` now accepts an optional boolean argument ``copy``, effective when dtype is categorical (:issue:`13209`) - Consistent with the Python API, ``pd.read_csv()`` will now interpret ``+inf`` as positive infinity (:issue:`13274`) +- The ``DataFrame`` constructor will now respect key ordering if a list of ``OrderedDict`` objects are passed in (:issue:`13304`) - ``pd.read_html()`` has gained support for the ``decimal`` option (:issue:`12907`) - ``eval``'s upcasting rules for ``float32`` types have been updated to be more consistent with NumPy's rules. New behavior will not upcast to ``float64`` if you multiply a pandas ``float32`` object by a scalar float64. (:issue:`12388`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 2c8106571f198..69def7502a6f7 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5537,7 +5537,8 @@ def _list_of_series_to_arrays(data, columns, coerce_float=False, dtype=None): def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None): if columns is None: gen = (list(x.keys()) for x in data) - columns = lib.fast_unique_multiple_list_gen(gen) + sort = not any(isinstance(d, OrderedDict) for d in data) + columns = lib.fast_unique_multiple_list_gen(gen, sort=sort) # assure that they are of the base dict class and not of derived # classes diff --git a/pandas/lib.pyx b/pandas/lib.pyx index 328166168a3fc..a9c7f93097f1b 100644 --- a/pandas/lib.pyx +++ b/pandas/lib.pyx @@ -493,7 +493,21 @@ def fast_unique_multiple_list(list lists): @cython.wraparound(False) @cython.boundscheck(False) -def fast_unique_multiple_list_gen(object gen): +def fast_unique_multiple_list_gen(object gen, bint sort=True): + """ + Generate a list of unique values from a generator of lists. 
+ + Parameters + ---------- + gen : generator object + A generator of lists from which the unique list is created + sort : boolean + Whether or not to sort the resulting unique list + + Returns + ------- + unique_list : list of unique values + """ cdef: list buf Py_ssize_t j, n @@ -508,11 +522,11 @@ def fast_unique_multiple_list_gen(object gen): if val not in table: table[val] = stub uniques.append(val) - - try: - uniques.sort() - except Exception: - pass + if sort: + try: + uniques.sort() + except Exception: + pass return uniques diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index a050d74f0fc51..b42aef9447373 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -891,6 +891,45 @@ def test_constructor_list_of_dicts(self): expected = DataFrame(index=[0]) tm.assert_frame_equal(result, expected) + def test_constructor_ordered_dict_preserve_order(self): + # see gh-13304 + expected = DataFrame([[2, 1]], columns=['b', 'a']) + + data = OrderedDict() + data['b'] = [2] + data['a'] = [1] + + result = DataFrame(data) + tm.assert_frame_equal(result, expected) + + data = OrderedDict() + data['b'] = 2 + data['a'] = 1 + + result = DataFrame([data]) + tm.assert_frame_equal(result, expected) + + def test_constructor_ordered_dict_conflicting_orders(self): + # the first dict element sets the ordering for the DataFrame, + # even if there are conflicting orders from subsequent ones + row_one = OrderedDict() + row_one['b'] = 2 + row_one['a'] = 1 + + row_two = OrderedDict() + row_two['a'] = 1 + row_two['b'] = 2 + + row_three = {'b': 2, 'a': 1} + + expected = DataFrame([[2, 1], [2, 1]], columns=['b', 'a']) + result = DataFrame([row_one, row_two]) + tm.assert_frame_equal(result, expected) + + expected = DataFrame([[2, 1], [2, 1], [2, 1]], columns=['b', 'a']) + result = DataFrame([row_one, row_two, row_three]) + tm.assert_frame_equal(result, expected) + def test_constructor_list_of_series(self): data = [OrderedDict([['a', 1.5], ['b', 3.0], ['c', 4.0]]), OrderedDict([['a', 1.5], ['b', 3.0], ['c', 6.0]])] @@ -1870,3 +1909,9 @@ def test_from_index(self): tm.assert_series_equal(df2[0], Series(idx2, name=0)) df2 = DataFrame(Series(idx2)) tm.assert_series_equal(df2[0], Series(idx2, name=0)) + +if __name__ == '__main__': + import nose # noqa + + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tests/test_lib.py b/pandas/tests/test_lib.py index bfac0aa83b434..10a6bb5c75b01 100644 --- a/pandas/tests/test_lib.py +++ b/pandas/tests/test_lib.py @@ -24,6 +24,19 @@ def test_max_len_string_array(self): tm.assertRaises(TypeError, lambda: lib.max_len_string_array(arr.astype('U'))) + def test_fast_unique_multiple_list_gen_sort(self): + keys = [['p', 'a'], ['n', 'd'], ['a', 's']] + + gen = (key for key in keys) + expected = np.array(['a', 'd', 'n', 'p', 's']) + out = lib.fast_unique_multiple_list_gen(gen, sort=True) + tm.assert_numpy_array_equal(np.array(out), expected) + + gen = (key for key in keys) + expected = np.array(['p', 'a', 'n', 'd', 's']) + out = lib.fast_unique_multiple_list_gen(gen, sort=False) + tm.assert_numpy_array_equal(np.array(out), expected) + class TestIndexing(tm.TestCase): From f3d7c18190045bdf02f4448e46880a7030829716 Mon Sep 17 00:00:00 2001 From: Roger Thomas Date: Tue, 31 May 2016 11:39:48 -0400 Subject: [PATCH 77/96] BUG: Fix maybe_convert_numeric for unhashable objects closes #13324 Author: Roger Thomas Closes #13326 from 
RogerThomas/fix_maybe_convert_numeric_for_unhashable_objects and squashes the following commits: 76a0738 [Roger Thomas] Fix maybe_convert_numeric for unhashable objects --- doc/source/whatsnew/v0.18.2.txt | 1 + pandas/src/inference.pyx | 2 +- pandas/tests/test_infer_and_convert.py | 7 +++++++ pandas/tools/tests/test_util.py | 12 ++++++++++++ 4 files changed, 21 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index 984001db5783c..33a48671a9b65 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -368,6 +368,7 @@ Bug Fixes - Bug in ``groupby`` where ``apply`` returns different result depending on whether first result is ``None`` or not (:issue:`12824`) +- Bug in ``pd.to_numeric`` when ``errors='coerce'`` and input contains non-hashable objects (:issue:`13324`) - Bug in ``Categorical.remove_unused_categories()`` changes ``.codes`` dtype to platform int (:issue:`13261`) diff --git a/pandas/src/inference.pyx b/pandas/src/inference.pyx index e2c59a34bdf21..d4e149eb09b65 100644 --- a/pandas/src/inference.pyx +++ b/pandas/src/inference.pyx @@ -569,7 +569,7 @@ def maybe_convert_numeric(object[:] values, set na_values, for i in range(n): val = values[i] - if val in na_values: + if val.__hash__ is not None and val in na_values: floats[i] = complexes[i] = nan seen_float = True elif util.is_float_object(val): diff --git a/pandas/tests/test_infer_and_convert.py b/pandas/tests/test_infer_and_convert.py index 06e2a82e07dee..7558934c32bc8 100644 --- a/pandas/tests/test_infer_and_convert.py +++ b/pandas/tests/test_infer_and_convert.py @@ -102,6 +102,13 @@ def test_scientific_no_exponent(self): result = lib.maybe_convert_numeric(arr, set(), False, True) self.assertTrue(np.all(np.isnan(result))) + def test_convert_non_hashable(self): + # GH13324 + # make sure that we are handing non-hashables + arr = np.array([[10.0, 2], 1.0, 'apple']) + result = lib.maybe_convert_numeric(arr, set(), False, True) + tm.assert_numpy_array_equal(result, np.array([np.nan, 1.0, np.nan])) + class TestTypeInference(tm.TestCase): _multiprocess_can_split_ = True diff --git a/pandas/tools/tests/test_util.py b/pandas/tools/tests/test_util.py index 4e704554f982f..c592b33bdab9a 100644 --- a/pandas/tools/tests/test_util.py +++ b/pandas/tools/tests/test_util.py @@ -279,6 +279,18 @@ def test_period(self): # res = pd.to_numeric(pd.Series(idx, name='xxx')) # tm.assert_series_equal(res, pd.Series(idx.asi8, name='xxx')) + def test_non_hashable(self): + # Test for Bug #13324 + s = pd.Series([[10.0, 2], 1.0, 'apple']) + res = pd.to_numeric(s, errors='coerce') + tm.assert_series_equal(res, pd.Series([np.nan, 1.0, np.nan])) + + res = pd.to_numeric(s, errors='ignore') + tm.assert_series_equal(res, pd.Series([[10.0, 2], 1.0, 'apple'])) + + with self.assertRaisesRegexp(TypeError, "Invalid object type"): + pd.to_numeric(s) + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], From 8bbd2bc8f148ec54b0cb8af9d9584816722c7e9f Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 31 May 2016 17:46:59 -0400 Subject: [PATCH 78/96] ENH: Series has gained the properties .is_monotonic* Author: Jeff Reback Closes #13336 from jreback/is_monotonic and squashes the following commits: 0a50ff9 [Jeff Reback] ENH: Series has gained the properties .is_monotonic, .is_monotonic_increasing, .is_monotonic_decreasing --- doc/source/api.rst | 3 +++ doc/source/whatsnew/v0.18.2.txt | 2 +- pandas/core/base.py | 31 +++++++++++++++++++++++++++ 
pandas/tests/series/test_analytics.py | 17 +++++++++++++++ 4 files changed, 52 insertions(+), 1 deletion(-) diff --git a/doc/source/api.rst b/doc/source/api.rst index 9e7ae2357c541..0e893308dd935 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -354,6 +354,9 @@ Computations / Descriptive Stats Series.unique Series.nunique Series.is_unique + Series.is_monotonic + Series.is_monotonic_increasing + Series.is_monotonic_decreasing Series.value_counts Reindexing / Selection / Label manipulation diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index 33a48671a9b65..3fc1a69cb600e 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -92,7 +92,7 @@ Other enhancements - ``pd.read_html()`` has gained support for the ``decimal`` option (:issue:`12907`) - ``eval``'s upcasting rules for ``float32`` types have been updated to be more consistent with NumPy's rules. New behavior will not upcast to ``float64`` if you multiply a pandas ``float32`` object by a scalar float64. (:issue:`12388`) - +- ``Series`` has gained the properties ``.is_monotonic``, ``.is_monotonic_increasing``, ``.is_monotonic_decreasing``, similar to ``Index`` (:issue:`13336`) .. _whatsnew_0182.api: diff --git a/pandas/core/base.py b/pandas/core/base.py index 36f1f24fec6f7..96732a7140f9e 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -995,6 +995,37 @@ def is_unique(self): """ return self.nunique() == len(self) + @property + def is_monotonic(self): + """ + Return boolean if values in the object are + monotonic_increasing + + .. versionadded:: 0.18.2 + + Returns + ------- + is_monotonic : boolean + """ + from pandas import Index + return Index(self).is_monotonic + is_monotonic_increasing = is_monotonic + + @property + def is_monotonic_decreasing(self): + """ + Return boolean if values in the object are + monotonic_decreasing + + .. 
versionadded:: 0.18.2 + + Returns + ------- + is_monotonic_decreasing : boolean + """ + from pandas import Index + return Index(self).is_monotonic_decreasing + def memory_usage(self, deep=False): """ Memory usage of my values diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index c190b0d9e3bb0..433f0f4bc67f5 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -1397,6 +1397,23 @@ def test_is_unique(self): s = Series(np.arange(1000)) self.assertTrue(s.is_unique) + def test_is_monotonic(self): + + s = Series(np.random.randint(0, 10, size=1000)) + self.assertFalse(s.is_monotonic) + s = Series(np.arange(1000)) + self.assertTrue(s.is_monotonic) + self.assertTrue(s.is_monotonic_increasing) + s = Series(np.arange(1000, 0, -1)) + self.assertTrue(s.is_monotonic_decreasing) + + s = Series(pd.date_range('20130101', periods=10)) + self.assertTrue(s.is_monotonic) + self.assertTrue(s.is_monotonic_increasing) + s = Series(list(reversed(s.tolist()))) + self.assertFalse(s.is_monotonic) + self.assertTrue(s.is_monotonic_decreasing) + def test_sort_values(self): ts = self.ts.copy() From 2e3c82e81bf00af157268a842a270a6181fcb168 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 31 May 2016 19:17:56 -0400 Subject: [PATCH 79/96] TST: computation/test_eval.py tests (slow) closes #13338 Author: Jeff Reback Closes #13339 from jreback/eval and squashes the following commits: b2ee5e8 [Jeff Reback] TST: computation/test_eval.py tests (slow) --- pandas/computation/tests/test_eval.py | 22 +++++++++++++--------- pandas/util/testing.py | 6 +++++- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/pandas/computation/tests/test_eval.py b/pandas/computation/tests/test_eval.py index 4dc1e24618a83..5019dd392a567 100644 --- a/pandas/computation/tests/test_eval.py +++ b/pandas/computation/tests/test_eval.py @@ -248,7 +248,8 @@ def check_operands(left, right, cmp_op): for ex in (ex1, ex2, ex3): result = pd.eval(ex, engine=self.engine, parser=self.parser) - tm.assert_numpy_array_equal(result, expected) + + tm.assert_almost_equal(result, expected) def check_simple_cmp_op(self, lhs, cmp1, rhs): ex = 'lhs {0} rhs'.format(cmp1) @@ -265,7 +266,8 @@ def check_binary_arith_op(self, lhs, arith1, rhs): ex = 'lhs {0} rhs'.format(arith1) result = pd.eval(ex, engine=self.engine, parser=self.parser) expected = _eval_single_bin(lhs, arith1, rhs, self.engine) - tm.assert_numpy_array_equal(result, expected) + + tm.assert_almost_equal(result, expected) ex = 'lhs {0} rhs {0} rhs'.format(arith1) result = pd.eval(ex, engine=self.engine, parser=self.parser) nlhs = _eval_single_bin(lhs, arith1, rhs, @@ -280,8 +282,10 @@ def check_alignment(self, result, nlhs, ghs, op): # TypeError, AttributeError: series or frame with scalar align pass else: + + # direct numpy comparison expected = self.ne.evaluate('nlhs {0} ghs'.format(op)) - tm.assert_numpy_array_equal(result, expected) + tm.assert_numpy_array_equal(result.values, expected) # modulus, pow, and floor division require special casing @@ -349,12 +353,12 @@ def check_single_invert_op(self, lhs, cmp1, rhs): elb = np.array([bool(el)]) expected = ~elb result = pd.eval('~elb', engine=self.engine, parser=self.parser) - tm.assert_numpy_array_equal(expected, result) + tm.assert_almost_equal(expected, result) for engine in self.current_engines: tm.skip_if_no_ne(engine) - tm.assert_numpy_array_equal(result, pd.eval('~elb', engine=engine, - parser=self.parser)) + tm.assert_almost_equal(result, pd.eval('~elb', 
engine=engine, + parser=self.parser)) def check_compound_invert_op(self, lhs, cmp1, rhs): skip_these = 'in', 'not in' @@ -374,13 +378,13 @@ def check_compound_invert_op(self, lhs, cmp1, rhs): else: expected = ~expected result = pd.eval(ex, engine=self.engine, parser=self.parser) - tm.assert_numpy_array_equal(expected, result) + tm.assert_almost_equal(expected, result) # make sure the other engines work the same as this one for engine in self.current_engines: tm.skip_if_no_ne(engine) ev = pd.eval(ex, engine=self.engine, parser=self.parser) - tm.assert_numpy_array_equal(ev, result) + tm.assert_almost_equal(ev, result) def ex(self, op, var_name='lhs'): return '{0}{1}'.format(op, var_name) @@ -728,7 +732,7 @@ def check_alignment(self, result, nlhs, ghs, op): pass else: expected = eval('nlhs {0} ghs'.format(op)) - tm.assert_numpy_array_equal(result, expected) + tm.assert_almost_equal(result, expected) class TestEvalPythonPandas(TestEvalPythonPython): diff --git a/pandas/util/testing.py b/pandas/util/testing.py index ef94692ea9673..03ccfcab24f58 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -25,7 +25,8 @@ import pandas as pd from pandas.core.common import (is_sequence, array_equivalent, is_list_like, is_datetimelike_v_numeric, - is_datetimelike_v_object, is_number, + is_datetimelike_v_object, + is_number, is_bool, needs_i8_conversion, is_categorical_dtype) from pandas.formats.printing import pprint_thing from pandas.core.algorithms import take_1d @@ -157,6 +158,9 @@ def assert_almost_equal(left, right, check_exact=False, if is_number(left) and is_number(right): # do not compare numeric classes, like np.float64 and float pass + elif is_bool(left) and is_bool(right): + # do not compare bool classes, like np.bool_ and bool + pass else: if (isinstance(left, np.ndarray) or isinstance(right, np.ndarray)): From 45bab82462235141d0697f75bce4e97a789b7398 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Wed, 1 Jun 2016 06:59:44 -0400 Subject: [PATCH 80/96] BUG: Parse trailing NaN values for the Python parser Fixes bug in which the Python parser failed to detect trailing `NaN` values in rows Author: gfyoung Closes #13320 from gfyoung/trailing-nan-conversion and squashes the following commits: 590874d [gfyoung] BUG: Parse trailing NaN values for the Python parser --- doc/source/whatsnew/v0.18.2.txt | 1 + pandas/io/parsers.py | 8 +++++--- pandas/io/tests/parser/c_parser_only.py | 9 --------- pandas/io/tests/parser/na_values.py | 9 +++++++++ pandas/src/inference.pyx | 21 +++++++++++++++++++-- pandas/tests/test_infer_and_convert.py | 17 +++++++++++++++++ 6 files changed, 51 insertions(+), 14 deletions(-) diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index 3fc1a69cb600e..37a18817f3627 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -349,6 +349,7 @@ Bug Fixes - Bug in ``pd.read_csv()`` with ``engine='python'`` in which infinities of mixed-case forms were not being interpreted properly (:issue:`13274`) +- Bug in ``pd.read_csv()`` with ``engine='python'`` in which trailing ``NaN`` values were not being parsed (:issue:`13320`) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 394fe1a98880a..1f0155c4cc7a0 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2226,14 +2226,16 @@ def _get_index_name(self, columns): return index_name, orig_names, columns def _rows_to_cols(self, content): - zipped_content = list(lib.to_object_array(content).T) - col_len = self.num_original_columns - zip_len = 
len(zipped_content) if self._implicit_index: col_len += len(self.index_col) + # see gh-13320 + zipped_content = list(lib.to_object_array( + content, min_width=col_len).T) + zip_len = len(zipped_content) + if self.skip_footer < 0: raise ValueError('skip footer cannot be negative') diff --git a/pandas/io/tests/parser/c_parser_only.py b/pandas/io/tests/parser/c_parser_only.py index 00c4e0a1c022b..7fca37cef473e 100644 --- a/pandas/io/tests/parser/c_parser_only.py +++ b/pandas/io/tests/parser/c_parser_only.py @@ -360,15 +360,6 @@ def test_raise_on_passed_int_dtype_with_nas(self): sep=",", skipinitialspace=True, dtype={'DOY': np.int64}) - def test_na_trailing_columns(self): - data = """Date,Currenncy,Symbol,Type,Units,UnitPrice,Cost,Tax -2012-03-14,USD,AAPL,BUY,1000 -2012-05-12,USD,SBUX,SELL,500""" - - result = self.read_csv(StringIO(data)) - self.assertEqual(result['Date'][1], '2012-05-12') - self.assertTrue(result['UnitPrice'].isnull().all()) - def test_parse_ragged_csv(self): data = """1,2,3 1,2,3,4 diff --git a/pandas/io/tests/parser/na_values.py b/pandas/io/tests/parser/na_values.py index d826ae536c6cc..2a8c934abce61 100644 --- a/pandas/io/tests/parser/na_values.py +++ b/pandas/io/tests/parser/na_values.py @@ -241,3 +241,12 @@ def test_na_values_na_filter_override(self): columns=['A', 'B']) out = self.read_csv(StringIO(data), na_values=['B'], na_filter=False) tm.assert_frame_equal(out, expected) + + def test_na_trailing_columns(self): + data = """Date,Currenncy,Symbol,Type,Units,UnitPrice,Cost,Tax +2012-03-14,USD,AAPL,BUY,1000 +2012-05-12,USD,SBUX,SELL,500""" + + result = self.read_csv(StringIO(data)) + self.assertEqual(result['Date'][1], '2012-05-12') + self.assertTrue(result['UnitPrice'].isnull().all()) diff --git a/pandas/src/inference.pyx b/pandas/src/inference.pyx index d4e149eb09b65..5f7c5478b5d87 100644 --- a/pandas/src/inference.pyx +++ b/pandas/src/inference.pyx @@ -1132,7 +1132,24 @@ def map_infer(ndarray arr, object f, bint convert=1): return result -def to_object_array(list rows): +def to_object_array(list rows, int min_width=0): + """ + Convert a list of lists into an object array. + + Parameters + ---------- + rows : 2-d array (N, K) + A list of lists to be converted into an array + min_width : int + The minimum width of the object array. If a list + in `rows` contains fewer than `width` elements, + the remaining elements in the corresponding row + will all be `NaN`. 
+ + Returns + ------- + obj_array : numpy array of the object dtype + """ cdef: Py_ssize_t i, j, n, k, tmp ndarray[object, ndim=2] result @@ -1140,7 +1157,7 @@ def to_object_array(list rows): n = len(rows) - k = 0 + k = min_width for i from 0 <= i < n: tmp = len(rows[i]) if tmp > k: diff --git a/pandas/tests/test_infer_and_convert.py b/pandas/tests/test_infer_and_convert.py index 7558934c32bc8..68eac12e5ec4c 100644 --- a/pandas/tests/test_infer_and_convert.py +++ b/pandas/tests/test_infer_and_convert.py @@ -201,6 +201,23 @@ def test_to_object_array_tuples(self): except ImportError: pass + def test_to_object_array_width(self): + # see gh-13320 + rows = [[1, 2, 3], [4, 5, 6]] + + expected = np.array(rows, dtype=object) + out = lib.to_object_array(rows) + tm.assert_numpy_array_equal(out, expected) + + expected = np.array(rows, dtype=object) + out = lib.to_object_array(rows, min_width=1) + tm.assert_numpy_array_equal(out, expected) + + expected = np.array([[1, 2, 3, None, None], + [4, 5, 6, None, None]], dtype=object) + out = lib.to_object_array(rows, min_width=5) + tm.assert_numpy_array_equal(out, expected) + def test_object(self): # GH 7431 From fcd73ad2e7482414b61d47056c6c9c220b11702c Mon Sep 17 00:00:00 2001 From: Hassan Shamim Date: Thu, 19 May 2016 14:13:04 -0700 Subject: [PATCH 81/96] BUG: GH13219 Fixed. Allow unicode values in usecols closes #13219 closes #13233 --- doc/source/whatsnew/v0.18.2.txt | 5 +- pandas/io/parsers.py | 9 +-- pandas/io/tests/parser/usecols.py | 106 +++++++++++++++++++++++++++++- 3 files changed, 111 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index 37a18817f3627..27540a9626398 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -338,18 +338,19 @@ Bug Fixes - Bug in ``.resample(..)`` with a ``PeriodIndex`` not changing its ``freq`` appropriately when empty (:issue:`13067`) - Bug in ``.resample(..)`` with a ``PeriodIndex`` not retaining its type or name with an empty ``DataFrame``appropriately when empty (:issue:`13212`) - Bug in ``groupby(..).resample(..)`` where passing some keywords would raise an exception (:issue:`13235`) -- Bug in ``pd.read_csv`` in which the ``nrows`` argument was not properly validated for both engines (:issue:`10476`) -- Bug in ``pd.read_csv()`` with ``engine='python'`` in which ``NaN`` values weren't being detected after data was converted to numeric values (:issue:`13314`) - Bug in ``MultiIndex`` slicing where extra elements were returned when level is non-unique (:issue:`12896`) +- Bug in ``pd.read_csv()`` with ``engine='python'`` in which ``NaN`` values weren't being detected after data was converted to numeric values (:issue:`13314`) +- Bug in ``pd.read_csv()`` in which the ``nrows`` argument was not properly validated for both engines (:issue:`10476`) - Bug in ``pd.read_csv()`` with ``engine='python'`` in which infinities of mixed-case forms were not being interpreted properly (:issue:`13274`) - Bug in ``pd.read_csv()`` with ``engine='python'`` in which trailing ``NaN`` values were not being parsed (:issue:`13320`) +- Bug in ``pd.read_csv()`` that prevents ``usecols`` kwarg from accepting single-byte unicode strings (:issue:`13219`) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 1f0155c4cc7a0..bba8ad3ccd72b 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -882,12 +882,13 @@ def _validate_usecols_arg(usecols): or strings (column by name). Raises a ValueError if that is not the case. 
""" + msg = ("The elements of 'usecols' must " + "either be all strings, all unicode, or all integers") + if usecols is not None: usecols_dtype = lib.infer_dtype(usecols) - if usecols_dtype not in ('integer', 'string'): - raise ValueError(("The elements of 'usecols' " - "must either be all strings " - "or all integers")) + if usecols_dtype not in ('integer', 'string', 'unicode'): + raise ValueError(msg) return usecols diff --git a/pandas/io/tests/parser/usecols.py b/pandas/io/tests/parser/usecols.py index 06275c168becd..0d3ae95f0d1d4 100644 --- a/pandas/io/tests/parser/usecols.py +++ b/pandas/io/tests/parser/usecols.py @@ -6,6 +6,7 @@ """ from datetime import datetime +import nose import pandas.util.testing as tm @@ -22,9 +23,8 @@ def test_raise_on_mixed_dtype_usecols(self): 1000,2000,3000 4000,5000,6000 """ - msg = ("The elements of \'usecols\' " - "must either be all strings " - "or all integers") + msg = ("The elements of 'usecols' must " + "either be all strings, all unicode, or all integers") usecols = [0, 'b', 2] with tm.assertRaisesRegexp(ValueError, msg): @@ -254,3 +254,103 @@ def test_usecols_with_parse_dates_and_usecol_names(self): usecols=[3, 0, 2], parse_dates=parse_dates) tm.assert_frame_equal(df, expected) + + def test_usecols_with_unicode_strings(self): + # see gh-13219 + + s = '''AAA,BBB,CCC,DDD + 0.056674973,8,True,a + 2.613230982,2,False,b + 3.568935038,7,False,a + ''' + + data = { + 'AAA': { + 0: 0.056674972999999997, + 1: 2.6132309819999997, + 2: 3.5689350380000002 + }, + 'BBB': {0: 8, 1: 2, 2: 7} + } + expected = DataFrame(data) + + df = self.read_csv(StringIO(s), usecols=[u'AAA', u'BBB']) + tm.assert_frame_equal(df, expected) + + def test_usecols_with_single_byte_unicode_strings(self): + # see gh-13219 + + s = '''A,B,C,D + 0.056674973,8,True,a + 2.613230982,2,False,b + 3.568935038,7,False,a + ''' + + data = { + 'A': { + 0: 0.056674972999999997, + 1: 2.6132309819999997, + 2: 3.5689350380000002 + }, + 'B': {0: 8, 1: 2, 2: 7} + } + expected = DataFrame(data) + + df = self.read_csv(StringIO(s), usecols=[u'A', u'B']) + tm.assert_frame_equal(df, expected) + + def test_usecols_with_mixed_encoding_strings(self): + s = '''AAA,BBB,CCC,DDD + 0.056674973,8,True,a + 2.613230982,2,False,b + 3.568935038,7,False,a + ''' + + msg = ("The elements of 'usecols' must " + "either be all strings, all unicode, or all integers") + + with tm.assertRaisesRegexp(ValueError, msg): + self.read_csv(StringIO(s), usecols=[u'AAA', b'BBB']) + + with tm.assertRaisesRegexp(ValueError, msg): + self.read_csv(StringIO(s), usecols=[b'AAA', u'BBB']) + + def test_usecols_with_multibyte_characters(self): + s = '''あああ,いい,ううう,ええええ + 0.056674973,8,True,a + 2.613230982,2,False,b + 3.568935038,7,False,a + ''' + data = { + 'あああ': { + 0: 0.056674972999999997, + 1: 2.6132309819999997, + 2: 3.5689350380000002 + }, + 'いい': {0: 8, 1: 2, 2: 7} + } + expected = DataFrame(data) + + df = self.read_csv(StringIO(s), usecols=['あああ', 'いい']) + tm.assert_frame_equal(df, expected) + + def test_usecols_with_multibyte_unicode_characters(self): + raise nose.SkipTest('TODO: see gh-13253') + + s = '''あああ,いい,ううう,ええええ + 0.056674973,8,True,a + 2.613230982,2,False,b + 3.568935038,7,False,a + ''' + data = { + 'あああ': { + 0: 0.056674972999999997, + 1: 2.6132309819999997, + 2: 3.5689350380000002 + }, + 'いい': {0: 8, 1: 2, 2: 7} + } + expected = DataFrame(data) + + df = self.read_csv(StringIO(s), usecols=[u'あああ', u'いい']) + tm.assert_frame_equal(df, expected) From 99e78da19d9d267fd4e30c0a0268c072856df26e Mon Sep 17 00:00:00 2001 From: Joris Van den 
Bossche Date: Thu, 2 Jun 2016 13:12:20 -0400 Subject: [PATCH 82/96] DOC: fix comment on previous versions cythonmagic Small thing I just noticed in the docs (the note on the other version was not updated when the example was changed from cythonmagic -> Cython) Author: Joris Van den Bossche Closes #13343 from jorisvandenbossche/doc-cythonmagic and squashes the following commits: 902352c [Joris Van den Bossche] DOC: fix comment on previous versions cythonmagic --- doc/source/enhancingperf.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/enhancingperf.rst b/doc/source/enhancingperf.rst index a4db4b7c0d953..685a8690a53d5 100644 --- a/doc/source/enhancingperf.rst +++ b/doc/source/enhancingperf.rst @@ -95,7 +95,7 @@ Plain cython ~~~~~~~~~~~~ First we're going to need to import the cython magic function to ipython (for -cython versions >=0.21 you can use ``%load_ext Cython``): +cython versions < 0.21 you can use ``%load_ext cythonmagic``): .. ipython:: python :okwarning: From ce56542d1226adf8b3439c51f0c34b49dd53bb28 Mon Sep 17 00:00:00 2001 From: Uwe Hoffmann Date: Thu, 2 Jun 2016 13:31:22 -0400 Subject: [PATCH 83/96] Fix #13306: Hour overflow in tz-aware datetime conversions. closes #13306 Author: Uwe Hoffmann Closes #13313 from uwedeportivo/master and squashes the following commits: be3ed90 [Uwe Hoffmann] whatsnew entry for issue #13306 1f5f7a5 [Uwe Hoffmann] Code Review jreback 82f263a [Uwe Hoffmann] Use vectorized searchsorted and tests. a1ed5a5 [Uwe Hoffmann] Fix #13306: Hour overflow in tz-aware datetime conversions. --- doc/source/whatsnew/v0.18.2.txt | 2 +- pandas/tseries/tests/test_timezones.py | 82 ++++++++++++++++++++++++++ pandas/tslib.pyx | 35 +++++------ 3 files changed, 99 insertions(+), 20 deletions(-) diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index 27540a9626398..950bf397f43b5 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -338,7 +338,7 @@ Bug Fixes - Bug in ``.resample(..)`` with a ``PeriodIndex`` not changing its ``freq`` appropriately when empty (:issue:`13067`) - Bug in ``.resample(..)`` with a ``PeriodIndex`` not retaining its type or name with an empty ``DataFrame``appropriately when empty (:issue:`13212`) - Bug in ``groupby(..).resample(..)`` where passing some keywords would raise an exception (:issue:`13235`) - +- Bug in ``.tz_convert`` on a tz-aware ``DateTimeIndex`` that relied on index being sorted for correct results (:issue: `13306`) diff --git a/pandas/tseries/tests/test_timezones.py b/pandas/tseries/tests/test_timezones.py index b80ee4c5c1e39..afe9d0652db19 100644 --- a/pandas/tseries/tests/test_timezones.py +++ b/pandas/tseries/tests/test_timezones.py @@ -902,6 +902,88 @@ def test_utc_with_system_utc(self): # check that the time hasn't changed. 
self.assertEqual(ts, ts.tz_convert(dateutil.tz.tzutc())) + def test_tz_convert_hour_overflow_dst(self): + # Regression test for: + # https://github.com/pydata/pandas/issues/13306 + + # sorted case US/Eastern -> UTC + ts = ['2008-05-12 09:50:00', + '2008-12-12 09:50:35', + '2009-05-12 09:50:32'] + tt = to_datetime(ts).tz_localize('US/Eastern') + ut = tt.tz_convert('UTC') + expected = np.array([13, 14, 13], dtype=np.int32) + self.assert_numpy_array_equal(ut.hour, expected) + + # sorted case UTC -> US/Eastern + ts = ['2008-05-12 13:50:00', + '2008-12-12 14:50:35', + '2009-05-12 13:50:32'] + tt = to_datetime(ts).tz_localize('UTC') + ut = tt.tz_convert('US/Eastern') + expected = np.array([9, 9, 9], dtype=np.int32) + self.assert_numpy_array_equal(ut.hour, expected) + + # unsorted case US/Eastern -> UTC + ts = ['2008-05-12 09:50:00', + '2008-12-12 09:50:35', + '2008-05-12 09:50:32'] + tt = to_datetime(ts).tz_localize('US/Eastern') + ut = tt.tz_convert('UTC') + expected = np.array([13, 14, 13], dtype=np.int32) + self.assert_numpy_array_equal(ut.hour, expected) + + # unsorted case UTC -> US/Eastern + ts = ['2008-05-12 13:50:00', + '2008-12-12 14:50:35', + '2008-05-12 13:50:32'] + tt = to_datetime(ts).tz_localize('UTC') + ut = tt.tz_convert('US/Eastern') + expected = np.array([9, 9, 9], dtype=np.int32) + self.assert_numpy_array_equal(ut.hour, expected) + + def test_tz_convert_hour_overflow_dst_timestamps(self): + # Regression test for: + # https://github.com/pydata/pandas/issues/13306 + + tz = self.tzstr('US/Eastern') + + # sorted case US/Eastern -> UTC + ts = [Timestamp('2008-05-12 09:50:00', tz=tz), + Timestamp('2008-12-12 09:50:35', tz=tz), + Timestamp('2009-05-12 09:50:32', tz=tz)] + tt = to_datetime(ts) + ut = tt.tz_convert('UTC') + expected = np.array([13, 14, 13], dtype=np.int32) + self.assert_numpy_array_equal(ut.hour, expected) + + # sorted case UTC -> US/Eastern + ts = [Timestamp('2008-05-12 13:50:00', tz='UTC'), + Timestamp('2008-12-12 14:50:35', tz='UTC'), + Timestamp('2009-05-12 13:50:32', tz='UTC')] + tt = to_datetime(ts) + ut = tt.tz_convert('US/Eastern') + expected = np.array([9, 9, 9], dtype=np.int32) + self.assert_numpy_array_equal(ut.hour, expected) + + # unsorted case US/Eastern -> UTC + ts = [Timestamp('2008-05-12 09:50:00', tz=tz), + Timestamp('2008-12-12 09:50:35', tz=tz), + Timestamp('2008-05-12 09:50:32', tz=tz)] + tt = to_datetime(ts) + ut = tt.tz_convert('UTC') + expected = np.array([13, 14, 13], dtype=np.int32) + self.assert_numpy_array_equal(ut.hour, expected) + + # unsorted case UTC -> US/Eastern + ts = [Timestamp('2008-05-12 13:50:00', tz='UTC'), + Timestamp('2008-12-12 14:50:35', tz='UTC'), + Timestamp('2008-05-12 13:50:32', tz='UTC')] + tt = to_datetime(ts) + ut = tt.tz_convert('US/Eastern') + expected = np.array([9, 9, 9], dtype=np.int32) + self.assert_numpy_array_equal(ut.hour, expected) + def test_tslib_tz_convert_trans_pos_plus_1__bug(self): # Regression test for tslib.tz_convert(vals, tz1, tz2). # See https://github.com/pydata/pandas/issues/4496 for details. 
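The tslib change below replaces a running-position scan of the DST transition
table (which implicitly assumed the input timestamps were sorted) with a
vectorized ``searchsorted`` lookup per value. A minimal NumPy sketch of that
idea follows; the function and the ``vals``/``trans``/``deltas`` names are
hypothetical stand-ins mirroring the variables in the hunk, not the actual
Cython code:

.. code-block:: python

    import numpy as np

    def apply_offsets(vals, trans, deltas):
        # trans:  sorted int64 transition instants
        # deltas: offset in effect from trans[i] until trans[i + 1]
        pos = np.searchsorted(trans, vals, side='right') - 1
        if (pos < 0).any():
            raise ValueError('First time before start of DST info')
        # each value looks up its own offset, so this stays correct
        # even when `vals` is unsorted, unlike the old advancing loop
        return vals + deltas[pos]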
diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index b3fb4989b2f23..6453e65ecdc81 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -3754,8 +3754,8 @@ except: def tz_convert(ndarray[int64_t] vals, object tz1, object tz2): cdef: - ndarray[int64_t] utc_dates, tt, result, trans, deltas - Py_ssize_t i, pos, n = len(vals) + ndarray[int64_t] utc_dates, tt, result, trans, deltas, posn + Py_ssize_t i, j, pos, n = len(vals) int64_t v, offset pandas_datetimestruct dts Py_ssize_t trans_len @@ -3791,19 +3791,18 @@ def tz_convert(ndarray[int64_t] vals, object tz1, object tz2): return vals trans_len = len(trans) - pos = trans.searchsorted(tt[0]) - 1 - if pos < 0: - raise ValueError('First time before start of DST info') - - offset = deltas[pos] + posn = trans.searchsorted(tt, side='right') + j = 0 for i in range(n): v = vals[i] if v == NPY_NAT: utc_dates[i] = NPY_NAT else: - while pos + 1 < trans_len and v >= trans[pos + 1]: - pos += 1 - offset = deltas[pos] + pos = posn[j] - 1 + j = j + 1 + if pos < 0: + raise ValueError('First time before start of DST info') + offset = deltas[pos] utc_dates[i] = v - offset else: utc_dates = vals @@ -3838,20 +3837,18 @@ def tz_convert(ndarray[int64_t] vals, object tz1, object tz2): if (result==NPY_NAT).all(): return result - pos = trans.searchsorted(utc_dates[utc_dates!=NPY_NAT][0]) - 1 - if pos < 0: - raise ValueError('First time before start of DST info') - - # TODO: this assumed sortedness :/ - offset = deltas[pos] + posn = trans.searchsorted(utc_dates[utc_dates!=NPY_NAT], side='right') + j = 0 for i in range(n): v = utc_dates[i] if vals[i] == NPY_NAT: result[i] = vals[i] else: - while pos + 1 < trans_len and v >= trans[pos + 1]: - pos += 1 - offset = deltas[pos] + pos = posn[j] - 1 + j = j + 1 + if pos < 0: + raise ValueError('First time before start of DST info') + offset = deltas[pos] result[i] = v + offset return result From 0c6226cbbc319ec22cf4c957bdcc055eaa7aea99 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Thu, 2 Jun 2016 19:15:18 -0400 Subject: [PATCH 84/96] ENH: Add support for compact_ints and use_unsigned in Python engine Title is self-explanatory. xref #12686 - I don't quite understand why these are marked (if at all) as internal to the C engine only, as the benefits for having these options accepted for the Python engine is quite clear based on the documentation I added as well. Implementation simply just calls the already-written function in `pandas/parsers.pyx` - as it isn't specific to the `TextReader` class, crossing over to grab this function from Cython (instead of duplicating in pure Python) seems reasonable while maintaining that separation between the C and Python engines. 
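A short usage sketch distilled from the tests added in this patch; with this
change both engines accept the (now deprecated) keywords and emit a
``FutureWarning``:

.. code-block:: python

    from io import StringIO

    import pandas as pd

    data = 'a,b,c\n1,9,258'

    out = pd.read_csv(StringIO(data), engine='python', compact_ints=True)
    print(out.dtypes)  # a, b -> int8; c -> int16

    out = pd.read_csv(StringIO(data), engine='python',
                      compact_ints=True, use_unsigned=True)
    print(out.dtypes)  # a, b -> uint8; c -> uint16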
Author: gfyoung Closes #13323 from gfyoung/python-engine-compact-ints and squashes the following commits: 95f7ba8 [gfyoung] ENH: Add support for compact_ints and use_unsigned in Python engine --- doc/source/io.rst | 11 +++ doc/source/whatsnew/v0.18.2.txt | 1 + pandas/io/parsers.py | 35 ++++++++- pandas/io/tests/parser/c_parser_only.py | 46 ++++-------- pandas/io/tests/parser/common.py | 43 +++++++++++ pandas/io/tests/parser/test_unsupported.py | 21 ++++++ pandas/parser.pyx | 72 +----------------- pandas/src/inference.pyx | 85 ++++++++++++++++++++++ pandas/tests/test_infer_and_convert.py | 36 +++++++++ 9 files changed, 246 insertions(+), 104 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index 6cf41bbc50fb5..4eb42e1fb918d 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -176,6 +176,17 @@ low_memory : boolean, default ``True`` Note that the entire file is read into a single DataFrame regardless, use the ``chunksize`` or ``iterator`` parameter to return the data in chunks. (Only valid with C parser) +compact_ints : boolean, default False + DEPRECATED: this argument will be removed in a future version + + If ``compact_ints`` is ``True``, then for any column that is of integer dtype, the + parser will attempt to cast it as the smallest integer ``dtype`` possible, either + signed or unsigned depending on the specification from the ``use_unsigned`` parameter. +use_unsigned : boolean, default False + DEPRECATED: this argument will be removed in a future version + + If integer columns are being compacted (i.e. ``compact_ints=True``), specify whether + the column should be compacted to the smallest signed or unsigned integer dtype. NA and Missing Data Handling ++++++++++++++++++++++++++++ diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index 950bf397f43b5..b87cdd91aa464 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -292,6 +292,7 @@ Other API changes Deprecations ^^^^^^^^^^^^ +- ``compact_ints`` and ``use_unsigned`` have been deprecated in ``pd.read_csv`` and will be removed in a future version (:issue:`13320`) .. _whatsnew_0182.performance: diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index bba8ad3ccd72b..2c8726f588522 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -227,6 +227,20 @@ Note that the entire file is read into a single DataFrame regardless, use the `chunksize` or `iterator` parameter to return the data in chunks. (Only valid with C parser) +compact_ints : boolean, default False + DEPRECATED: this argument will be removed in a future version + + If compact_ints is True, then for any column that is of integer dtype, + the parser will attempt to cast it as the smallest integer dtype possible, + either signed or unsigned depending on the specification from the + `use_unsigned` parameter. + +use_unsigned : boolean, default False + DEPRECATED: this argument will be removed in a future version + + If integer columns are being compacted (i.e. `compact_ints=True`), specify + whether the column should be compacted to the smallest signed or unsigned + integer dtype. 
Returns ------- @@ -425,8 +439,6 @@ def _read(filepath_or_buffer, kwds): _c_unsupported = set(['skip_footer']) _python_unsupported = set([ 'as_recarray', - 'compact_ints', - 'use_unsigned', 'low_memory', 'memory_map', 'buffer_lines', @@ -435,6 +447,10 @@ def _read(filepath_or_buffer, kwds): 'dtype', 'float_precision', ]) +_deprecated_args = set([ + 'compact_ints', + 'use_unsigned', +]) def _make_parser_function(name, sep=','): @@ -789,6 +805,12 @@ def _clean_options(self, options, engine): _validate_header_arg(options['header']) + for arg in _deprecated_args: + if result[arg] != _c_parser_defaults[arg]: + warnings.warn("The '{arg}' argument has been deprecated " + "and will be removed in a future version" + .format(arg=arg), FutureWarning, stacklevel=2) + if index_col is True: raise ValueError("The value of index_col couldn't be 'True'") if _is_index_col(index_col): @@ -1206,6 +1228,12 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False, cvals, na_count = self._convert_types( values, set(col_na_values) | col_na_fvalues, coerce_type) + + if issubclass(cvals.dtype.type, np.integer) and self.compact_ints: + cvals = lib.downcast_int64( + cvals, _parser.na_values, + self.use_unsigned) + result[c] = cvals if verbose and na_count: print('Filled %d NA values in column %s' % (na_count, str(c))) @@ -1648,8 +1676,11 @@ def __init__(self, f, **kwds): self.verbose = kwds['verbose'] self.converters = kwds['converters'] + self.compact_ints = kwds['compact_ints'] + self.use_unsigned = kwds['use_unsigned'] self.thousands = kwds['thousands'] self.decimal = kwds['decimal'] + self.comment = kwds['comment'] self._comment_lines = [] diff --git a/pandas/io/tests/parser/c_parser_only.py b/pandas/io/tests/parser/c_parser_only.py index 7fca37cef473e..b7ef754004e18 100644 --- a/pandas/io/tests/parser/c_parser_only.py +++ b/pandas/io/tests/parser/c_parser_only.py @@ -172,28 +172,8 @@ def error(val): self.assertTrue(sum(precise_errors) <= sum(normal_errors)) self.assertTrue(max(precise_errors) <= max(normal_errors)) - def test_compact_ints(self): - if compat.is_platform_windows() and not self.low_memory: - raise nose.SkipTest( - "segfaults on win-64, only when all tests are run") - - data = ('0,1,0,0\n' - '1,1,0,0\n' - '0,1,0,1') - - result = self.read_csv(StringIO(data), delimiter=',', header=None, - compact_ints=True, as_recarray=True) - ex_dtype = np.dtype([(str(i), 'i1') for i in range(4)]) - self.assertEqual(result.dtype, ex_dtype) - - result = self.read_csv(StringIO(data), delimiter=',', header=None, - as_recarray=True, compact_ints=True, - use_unsigned=True) - ex_dtype = np.dtype([(str(i), 'u1') for i in range(4)]) - self.assertEqual(result.dtype, ex_dtype) - def test_compact_ints_as_recarray(self): - if compat.is_platform_windows() and self.low_memory: + if compat.is_platform_windows(): raise nose.SkipTest( "segfaults on win-64, only when all tests are run") @@ -201,16 +181,20 @@ def test_compact_ints_as_recarray(self): '1,1,0,0\n' '0,1,0,1') - result = self.read_csv(StringIO(data), delimiter=',', header=None, - compact_ints=True, as_recarray=True) - ex_dtype = np.dtype([(str(i), 'i1') for i in range(4)]) - self.assertEqual(result.dtype, ex_dtype) - - result = self.read_csv(StringIO(data), delimiter=',', header=None, - as_recarray=True, compact_ints=True, - use_unsigned=True) - ex_dtype = np.dtype([(str(i), 'u1') for i in range(4)]) - self.assertEqual(result.dtype, ex_dtype) + with tm.assert_produces_warning( + FutureWarning, check_stacklevel=False): + result = 
self.read_csv(StringIO(data), delimiter=',', header=None, + compact_ints=True, as_recarray=True) + ex_dtype = np.dtype([(str(i), 'i1') for i in range(4)]) + self.assertEqual(result.dtype, ex_dtype) + + with tm.assert_produces_warning( + FutureWarning, check_stacklevel=False): + result = self.read_csv(StringIO(data), delimiter=',', header=None, + as_recarray=True, compact_ints=True, + use_unsigned=True) + ex_dtype = np.dtype([(str(i), 'u1') for i in range(4)]) + self.assertEqual(result.dtype, ex_dtype) def test_pass_dtype(self): data = """\ diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py index 44892dc17c47b..f8c7241fdf88a 100644 --- a/pandas/io/tests/parser/common.py +++ b/pandas/io/tests/parser/common.py @@ -1330,3 +1330,46 @@ def test_raise_on_no_columns(self): # test with more than a single newline data = "\n\n\n" self.assertRaises(EmptyDataError, self.read_csv, StringIO(data)) + + def test_compact_ints_use_unsigned(self): + # see gh-13323 + data = 'a,b,c\n1,9,258' + + # sanity check + expected = DataFrame({ + 'a': np.array([1], dtype=np.int64), + 'b': np.array([9], dtype=np.int64), + 'c': np.array([258], dtype=np.int64), + }) + out = self.read_csv(StringIO(data)) + tm.assert_frame_equal(out, expected) + + expected = DataFrame({ + 'a': np.array([1], dtype=np.int8), + 'b': np.array([9], dtype=np.int8), + 'c': np.array([258], dtype=np.int16), + }) + + # default behaviour for 'use_unsigned' + with tm.assert_produces_warning( + FutureWarning, check_stacklevel=False): + out = self.read_csv(StringIO(data), compact_ints=True) + tm.assert_frame_equal(out, expected) + + with tm.assert_produces_warning( + FutureWarning, check_stacklevel=False): + out = self.read_csv(StringIO(data), compact_ints=True, + use_unsigned=False) + tm.assert_frame_equal(out, expected) + + expected = DataFrame({ + 'a': np.array([1], dtype=np.uint8), + 'b': np.array([9], dtype=np.uint8), + 'c': np.array([258], dtype=np.uint16), + }) + + with tm.assert_produces_warning( + FutureWarning, check_stacklevel=False): + out = self.read_csv(StringIO(data), compact_ints=True, + use_unsigned=True) + tm.assert_frame_equal(out, expected) diff --git a/pandas/io/tests/parser/test_unsupported.py b/pandas/io/tests/parser/test_unsupported.py index 3c1c45831e7b4..e820924d2be8b 100644 --- a/pandas/io/tests/parser/test_unsupported.py +++ b/pandas/io/tests/parser/test_unsupported.py @@ -117,6 +117,27 @@ def test_python_engine(self): with tm.assertRaisesRegexp(ValueError, msg): read_csv(StringIO(data), engine=engine, **kwargs) + +class TestDeprecatedFeatures(tm.TestCase): + def test_deprecated_args(self): + data = '1,2,3' + + # deprecated arguments with non-default values + deprecated = { + 'compact_ints': True, + 'use_unsigned': True, + } + + engines = 'c', 'python' + + for engine in engines: + for arg, non_default_val in deprecated.items(): + with tm.assert_produces_warning( + FutureWarning, check_stacklevel=False): + kwargs = {arg: non_default_val} + read_csv(StringIO(data), engine=engine, + **kwargs) + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) diff --git a/pandas/parser.pyx b/pandas/parser.pyx index 729e5af528b80..d7ddaee658fe7 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -1018,7 +1018,7 @@ cdef class TextReader: col_res = _maybe_upcast(col_res) if issubclass(col_res.dtype.type, np.integer) and self.compact_ints: - col_res = downcast_int64(col_res, self.use_unsigned) + col_res = lib.downcast_int64(col_res, na_values, 
self.use_unsigned) if col_res is None: raise CParserError('Unable to parse column %d' % i) @@ -1866,76 +1866,6 @@ cdef raise_parser_error(object base, parser_t *parser): raise CParserError(message) -def downcast_int64(ndarray[int64_t] arr, bint use_unsigned=0): - cdef: - Py_ssize_t i, n = len(arr) - int64_t mx = INT64_MIN + 1, mn = INT64_MAX - int64_t NA = na_values[np.int64] - int64_t val - ndarray[uint8_t] mask - int na_count = 0 - - _mask = np.empty(n, dtype=bool) - mask = _mask.view(np.uint8) - - for i in range(n): - val = arr[i] - - if val == NA: - mask[i] = 1 - na_count += 1 - continue - - # not NA - mask[i] = 0 - - if val > mx: - mx = val - - if val < mn: - mn = val - - if mn >= 0 and use_unsigned: - if mx <= UINT8_MAX - 1: - result = arr.astype(np.uint8) - if na_count: - np.putmask(result, _mask, na_values[np.uint8]) - return result - - if mx <= UINT16_MAX - 1: - result = arr.astype(np.uint16) - if na_count: - np.putmask(result, _mask, na_values[np.uint16]) - return result - - if mx <= UINT32_MAX - 1: - result = arr.astype(np.uint32) - if na_count: - np.putmask(result, _mask, na_values[np.uint32]) - return result - - else: - if mn >= INT8_MIN + 1 and mx <= INT8_MAX: - result = arr.astype(np.int8) - if na_count: - np.putmask(result, _mask, na_values[np.int8]) - return result - - if mn >= INT16_MIN + 1 and mx <= INT16_MAX: - result = arr.astype(np.int16) - if na_count: - np.putmask(result, _mask, na_values[np.int16]) - return result - - if mn >= INT32_MIN + 1 and mx <= INT32_MAX: - result = arr.astype(np.int32) - if na_count: - np.putmask(result, _mask, na_values[np.int32]) - return result - - return arr - - def _concatenate_chunks(list chunks): cdef: list names = list(chunks[0].keys()) diff --git a/pandas/src/inference.pyx b/pandas/src/inference.pyx index 5f7c5478b5d87..262e036ff44f1 100644 --- a/pandas/src/inference.pyx +++ b/pandas/src/inference.pyx @@ -6,6 +6,20 @@ iNaT = util.get_nat() cdef bint PY2 = sys.version_info[0] == 2 +cdef extern from "headers/stdint.h": + enum: UINT8_MAX + enum: UINT16_MAX + enum: UINT32_MAX + enum: UINT64_MAX + enum: INT8_MIN + enum: INT8_MAX + enum: INT16_MIN + enum: INT16_MAX + enum: INT32_MAX + enum: INT32_MIN + enum: INT64_MAX + enum: INT64_MIN + # core.common import for fast inference checks def is_float(object obj): return util.is_float_object(obj) @@ -1240,3 +1254,74 @@ def fast_multiget(dict mapping, ndarray keys, default=np.nan): output[i] = default return maybe_convert_objects(output) + + +def downcast_int64(ndarray[int64_t] arr, object na_values, + bint use_unsigned=0): + cdef: + Py_ssize_t i, n = len(arr) + int64_t mx = INT64_MIN + 1, mn = INT64_MAX + int64_t NA = na_values[np.int64] + int64_t val + ndarray[uint8_t] mask + int na_count = 0 + + _mask = np.empty(n, dtype=bool) + mask = _mask.view(np.uint8) + + for i in range(n): + val = arr[i] + + if val == NA: + mask[i] = 1 + na_count += 1 + continue + + # not NA + mask[i] = 0 + + if val > mx: + mx = val + + if val < mn: + mn = val + + if mn >= 0 and use_unsigned: + if mx <= UINT8_MAX - 1: + result = arr.astype(np.uint8) + if na_count: + np.putmask(result, _mask, na_values[np.uint8]) + return result + + if mx <= UINT16_MAX - 1: + result = arr.astype(np.uint16) + if na_count: + np.putmask(result, _mask, na_values[np.uint16]) + return result + + if mx <= UINT32_MAX - 1: + result = arr.astype(np.uint32) + if na_count: + np.putmask(result, _mask, na_values[np.uint32]) + return result + + else: + if mn >= INT8_MIN + 1 and mx <= INT8_MAX: + result = arr.astype(np.int8) + if na_count: + 
np.putmask(result, _mask, na_values[np.int8]) + return result + + if mn >= INT16_MIN + 1 and mx <= INT16_MAX: + result = arr.astype(np.int16) + if na_count: + np.putmask(result, _mask, na_values[np.int16]) + return result + + if mn >= INT32_MIN + 1 and mx <= INT32_MAX: + result = arr.astype(np.int32) + if na_count: + np.putmask(result, _mask, na_values[np.int32]) + return result + + return arr diff --git a/pandas/tests/test_infer_and_convert.py b/pandas/tests/test_infer_and_convert.py index 68eac12e5ec4c..a6941369b35be 100644 --- a/pandas/tests/test_infer_and_convert.py +++ b/pandas/tests/test_infer_and_convert.py @@ -401,6 +401,42 @@ def test_convert_sql_column_decimals(self): expected = np.array([1.5, np.nan, 3, 4.2], dtype='f8') self.assert_numpy_array_equal(result, expected) + def test_convert_downcast_int64(self): + from pandas.parser import na_values + + arr = np.array([1, 2, 7, 8, 10], dtype=np.int64) + expected = np.array([1, 2, 7, 8, 10], dtype=np.int8) + + # default argument + result = lib.downcast_int64(arr, na_values) + self.assert_numpy_array_equal(result, expected) + + result = lib.downcast_int64(arr, na_values, use_unsigned=False) + self.assert_numpy_array_equal(result, expected) + + expected = np.array([1, 2, 7, 8, 10], dtype=np.uint8) + result = lib.downcast_int64(arr, na_values, use_unsigned=True) + self.assert_numpy_array_equal(result, expected) + + # still cast to int8 despite use_unsigned=True + # because of the negative number as an element + arr = np.array([1, 2, -7, 8, 10], dtype=np.int64) + expected = np.array([1, 2, -7, 8, 10], dtype=np.int8) + result = lib.downcast_int64(arr, na_values, use_unsigned=True) + self.assert_numpy_array_equal(result, expected) + + arr = np.array([1, 2, 7, 8, 300], dtype=np.int64) + expected = np.array([1, 2, 7, 8, 300], dtype=np.int16) + result = lib.downcast_int64(arr, na_values) + self.assert_numpy_array_equal(result, expected) + + int8_na = na_values[np.int8] + int64_na = na_values[np.int64] + arr = np.array([int64_na, 2, 3, 10, 15], dtype=np.int64) + expected = np.array([int8_na, 2, 3, 10, 15], dtype=np.int8) + result = lib.downcast_int64(arr, na_values) + self.assert_numpy_array_equal(result, expected) + if __name__ == '__main__': import nose From 2061e9e5fbbd890c484b53232b0747e08d7d1739 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?G=C3=A1bor=20Lipt=C3=A1k?= Date: Fri, 3 Jun 2016 11:00:50 -0400 Subject: [PATCH 85/96] BUG: Fix series comparison operators when dealing with zero rank numpy arrays MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit closes #13006 Author: Gábor Lipták Closes #13307 from gliptak/seriescomp1 and squashes the following commits: 4967db4 [Gábor Lipták] Fix series comparison operators when dealing with zero rank numpy arrays --- doc/source/whatsnew/v0.18.2.txt | 3 ++- pandas/core/ops.py | 5 ++++- pandas/tests/series/test_operators.py | 12 ++++++++++++ 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index b87cdd91aa464..2f6afa8ed2ad0 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -101,7 +101,7 @@ API changes - Non-convertible dates in an excel date column will be returned without conversion and the column will be ``object`` dtype, rather than raising an exception (:issue:`10001`) -- An ``UnsupportedFunctionCall`` error is now raised if numpy ufuncs like ``np.mean`` are called on groupby or resample objects (:issue:`12811`) +- An ``UnsupportedFunctionCall`` 
error is now raised if NumPy ufuncs like ``np.mean`` are called on groupby or resample objects (:issue:`12811`)
 - Calls to ``.sample()`` will respect the random seed set via ``numpy.random.seed(n)`` (:issue:`13161`)

 .. _whatsnew_0182.api.tolist:

@@ -368,6 +368,7 @@ Bug Fixes

 - Bug in ``.unstack`` with ``Categorical`` dtype resets ``.ordered`` to ``True`` (:issue:`13249`)
+- Bug in ``Series`` comparison operators when dealing with zero dim NumPy arrays (:issue:`13006`)
 - Bug in ``groupby`` where ``apply`` returns different result depending on whether first result is ``None`` or not (:issue:`12824`)

diff --git a/pandas/core/ops.py b/pandas/core/ops.py
index d1bb67fa0bc13..f27a83f50e115 100644
--- a/pandas/core/ops.py
+++ b/pandas/core/ops.py
@@ -754,7 +754,10 @@ def wrapper(self, other, axis=None):
         elif isinstance(other, pd.DataFrame):  # pragma: no cover
             return NotImplemented
         elif isinstance(other, (np.ndarray, pd.Index)):
-            if len(self) != len(other):
+            # do not check length of zerodim array
+            # as it will broadcast
+            if (not lib.isscalar(lib.item_from_zerodim(other)) and
+                    len(self) != len(other)):
                 raise ValueError('Lengths must match to compare')
             return self._constructor(na_op(self.values, np.asarray(other)),
                                      index=self.index).__finalize__(self)
diff --git a/pandas/tests/series/test_operators.py b/pandas/tests/series/test_operators.py
index 3588faa8b42f1..1e23c87fdb4ca 100644
--- a/pandas/tests/series/test_operators.py
+++ b/pandas/tests/series/test_operators.py
@@ -264,6 +264,18 @@ def test_operators_timedelta64(self):
         rs[2] += np.timedelta64(timedelta(minutes=5, seconds=1))
         self.assertEqual(rs[2], value)

+    def test_operator_series_comparison_zerorank(self):
+        # GH 13006
+        result = np.float64(0) > pd.Series([1, 2, 3])
+        expected = 0.0 > pd.Series([1, 2, 3])
+        self.assert_series_equal(result, expected)
+        result = pd.Series([1, 2, 3]) < np.float64(0)
+        expected = pd.Series([1, 2, 3]) < 0.0
+        self.assert_series_equal(result, expected)
+        result = np.array([0, 1, 2])[0] > pd.Series([0, 1, 2])
+        expected = 0.0 > pd.Series([0, 1, 2])
+        self.assert_series_equal(result, expected)
+
     def test_timedeltas_with_DateOffset(self):

         # GH 4532

From 103f7d31e1b850e532ed85a4b53ef222d1271c54 Mon Sep 17 00:00:00 2001
From: Chris Warth
Date: Fri, 3 Jun 2016 16:57:24 -0400
Subject: [PATCH 86/96] DOC: Add example usage to DataFrame.filter

Author: Chris Warth

Closes #12399 from cswarth/doc/df_filter and squashes the following commits:

f48e9ff [Chris Warth] DOC: Add example usage to DataFrame.filter
---
 doc/source/whatsnew/v0.18.2.txt             |  1 +
 pandas/core/generic.py                      | 56 ++++++++++++++++---
 .../tests/frame/test_axis_select_reindex.py | 16 ++++++
 3 files changed, 66 insertions(+), 7 deletions(-)

diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt
index 2f6afa8ed2ad0..7493150370e9f 100644
--- a/doc/source/whatsnew/v0.18.2.txt
+++ b/doc/source/whatsnew/v0.18.2.txt
@@ -286,6 +286,7 @@ Other API changes
 - ``Float64Index.astype(int)`` will now raise ``ValueError`` if ``Float64Index`` contains ``NaN`` values (:issue:`13149`)
 - ``TimedeltaIndex.astype(int)`` and ``DatetimeIndex.astype(int)`` will now return ``Int64Index`` instead of ``np.array`` (:issue:`13209`)
+- ``.filter()`` enforces mutual exclusion of the keyword arguments. (:issue:`12399`)

 ..
_whatsnew_0182.deprecations: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 9ecaaebc2b523..0852c5a293f4e 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2357,7 +2357,11 @@ def _reindex_axis(self, new_index, fill_method, axis, copy): def filter(self, items=None, like=None, regex=None, axis=None): """ - Restrict the info axis to set of items or wildcard + Subset rows or columns of dataframe according to labels in + the specified index. + + Note that this routine does not filter a dataframe on its + contents. The filter is applied to the labels of the index. Parameters ---------- @@ -2367,19 +2371,57 @@ def filter(self, items=None, like=None, regex=None, axis=None): Keep info axis where "arg in col == True" regex : string (regular expression) Keep info axis with re.search(regex, col) == True - axis : int or None - The axis to filter on. By default this is the info axis. The "info - axis" is the axis that is used when indexing with ``[]``. For - example, ``df = DataFrame({'a': [1, 2, 3, 4]]}); df['a']``. So, - the ``DataFrame`` columns are the info axis. + axis : int or string axis name + The axis to filter on. By default this is the info axis, + 'index' for Series, 'columns' for DataFrame + + Returns + ------- + same type as input object + + Examples + -------- + >>> df + one two three + mouse 1 2 3 + rabbit 4 5 6 + + >>> # select columns by name + >>> df.filter(items=['one', 'three']) + one three + mouse 1 3 + rabbit 4 6 + + >>> # select columns by regular expression + >>> df.filter(regex='e$', axis=1) + one three + mouse 1 3 + rabbit 4 6 + + >>> # select rows containing 'bbi' + >>> df.filter(like='bbi', axis=0) + one two three + rabbit 4 5 6 + + See Also + -------- + pandas.DataFrame.select Notes ----- - Arguments are mutually exclusive, but this is not checked for + The ``items``, ``like``, and ``regex`` parameters are + enforced to be mutually exclusive. + ``axis`` defaults to the info axis that is used when indexing + with ``[]``. 
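+
+        Passing more than one of ``items``, ``like``, or ``regex`` raises a
+        ``TypeError``; for example, ``df.filter(items=['one'], like='bbi')``
+        is rejected.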
""" import re + nkw = sum([x is not None for x in [items, like, regex]]) + if nkw > 1: + raise TypeError('Keyword arguments `items`, `like`, or `regex` ' + 'are mutually exclusive') + if axis is None: axis = self._info_axis_name axis_name = self._get_axis_name(axis) diff --git a/pandas/tests/frame/test_axis_select_reindex.py b/pandas/tests/frame/test_axis_select_reindex.py index 07fe28f13b7d0..9da1b31d259c5 100644 --- a/pandas/tests/frame/test_axis_select_reindex.py +++ b/pandas/tests/frame/test_axis_select_reindex.py @@ -661,8 +661,24 @@ def test_filter(self): assert_frame_equal(filtered, expected) # pass in None + with assertRaisesRegexp(TypeError, 'Must pass'): + self.frame.filter() with assertRaisesRegexp(TypeError, 'Must pass'): self.frame.filter(items=None) + with assertRaisesRegexp(TypeError, 'Must pass'): + self.frame.filter(axis=1) + + # test mutually exclusive arguments + with assertRaisesRegexp(TypeError, 'mutually exclusive'): + self.frame.filter(items=['one', 'three'], regex='e$', like='bbi') + with assertRaisesRegexp(TypeError, 'mutually exclusive'): + self.frame.filter(items=['one', 'three'], regex='e$', axis=1) + with assertRaisesRegexp(TypeError, 'mutually exclusive'): + self.frame.filter(items=['one', 'three'], regex='e$') + with assertRaisesRegexp(TypeError, 'mutually exclusive'): + self.frame.filter(items=['one', 'three'], like='bbi', axis=0) + with assertRaisesRegexp(TypeError, 'mutually exclusive'): + self.frame.filter(items=['one', 'three'], like='bbi') # objects filtered = self.mixed_frame.filter(like='foo') From faf9b7d3218bc25068692ebc273f4c6942382a84 Mon Sep 17 00:00:00 2001 From: babakkeyvani Date: Sun, 5 Jun 2016 09:50:35 -0400 Subject: [PATCH 87/96] DOC: Fixed a minor typo Author: babakkeyvani Closes #13366 from bkeyvani/master and squashes the following commits: 029ade7 [babakkeyvani] DOC: Fixed a minor typo --- doc/README.rst | 2 +- doc/source/contributing.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/README.rst b/doc/README.rst index 06d95e6b9c44d..a93ad32a4c8f8 100644 --- a/doc/README.rst +++ b/doc/README.rst @@ -160,7 +160,7 @@ and `Good as first PR `_ where you could start out. -Or maybe you have an idea of you own, by using pandas, looking for something +Or maybe you have an idea of your own, by using pandas, looking for something in the documentation and thinking 'this can be improved', let's do something about that! diff --git a/doc/source/contributing.rst b/doc/source/contributing.rst index e64ff4c155132..a9b86925666b7 100644 --- a/doc/source/contributing.rst +++ b/doc/source/contributing.rst @@ -21,7 +21,7 @@ and `Difficulty Novice `_ where you could start out. -Or maybe through using *pandas* you have an idea of you own or are looking for something +Or maybe through using *pandas* you have an idea of your own or are looking for something in the documentation and thinking 'this can be improved'...you can do something about it! From eca7891c5e6bf1ea8fd1460ab6be171769616a73 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Sun, 5 Jun 2016 09:54:49 -0400 Subject: [PATCH 88/96] DOC: document doublequote in read_csv Title is self-explanatory. 
Author: gfyoung Closes #13368 from gfyoung/doublequote-doc and squashes the following commits: f3e01fc [gfyoung] DOC: document doublequote in read_csv --- doc/source/io.rst | 4 ++++ pandas/io/parsers.py | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/doc/source/io.rst b/doc/source/io.rst index 4eb42e1fb918d..79867d33c5838 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -273,6 +273,10 @@ quoting : int or ``csv.QUOTE_*`` instance, default ``None`` ``QUOTE_MINIMAL`` (0), ``QUOTE_ALL`` (1), ``QUOTE_NONNUMERIC`` (2) or ``QUOTE_NONE`` (3). Default (``None``) results in ``QUOTE_MINIMAL`` behavior. +doublequote : boolean, default ``True`` + When ``quotechar`` is specified and ``quoting`` is not ``QUOTE_NONE``, + indicate whether or not to interpret two consecutive ``quotechar`` elements + **inside** a field as a single ``quotechar`` element. escapechar : str (length 1), default ``None`` One-character string used to escape delimiter when quoting is ``QUOTE_NONE``. comment : str, default ``None`` diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 2c8726f588522..150e5ba5e1521 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -192,6 +192,10 @@ Control field quoting behavior per ``csv.QUOTE_*`` constants. Use one of QUOTE_MINIMAL (0), QUOTE_ALL (1), QUOTE_NONNUMERIC (2) or QUOTE_NONE (3). Default (None) results in QUOTE_MINIMAL behavior. +doublequote : boolean, default ``True`` + When quotechar is specified and quoting is not ``QUOTE_NONE``, indicate + whether or not to interpret two consecutive quotechar elements INSIDE a + field as a single ``quotechar`` element. escapechar : str (length 1), default None One-character string used to escape delimiter when quoting is QUOTE_NONE. comment : str, default None From 863cbc571b17a1734d813a45201b8158643ce3e2 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Sun, 5 Jun 2016 09:56:35 -0400 Subject: [PATCH 89/96] DEPR, DOC: Deprecate buffer_lines in read_csv `buffer_lines` is not respected, as it is determined internally via a heuristic involving `table_width` (see here for how it is computed). Author: gfyoung Closes #13360 from gfyoung/buffer-lines-depr-doc and squashes the following commits: a72ecbe [gfyoung] DEPR, DOC: Deprecate buffer_lines in read_csv --- doc/source/io.rst | 6 ++++++ doc/source/whatsnew/v0.18.2.txt | 3 ++- pandas/io/parsers.py | 11 +++++++++-- pandas/io/tests/parser/test_parsers.py | 2 -- pandas/io/tests/parser/test_unsupported.py | 5 +++++ 5 files changed, 22 insertions(+), 5 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index 79867d33c5838..f559c3cb3ebaf 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -176,6 +176,12 @@ low_memory : boolean, default ``True`` Note that the entire file is read into a single DataFrame regardless, use the ``chunksize`` or ``iterator`` parameter to return the data in chunks. (Only valid with C parser) +buffer_lines : int, default None + DEPRECATED: this argument will be removed in a future version because its + value is not respected by the parser + + If ``low_memory`` is ``True``, specify the number of rows to be read for + each chunk. 
(Only valid with C parser) compact_ints : boolean, default False DEPRECATED: this argument will be removed in a future version diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index 7493150370e9f..2f841fa6b6e18 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -293,7 +293,8 @@ Other API changes Deprecations ^^^^^^^^^^^^ -- ``compact_ints`` and ``use_unsigned`` have been deprecated in ``pd.read_csv`` and will be removed in a future version (:issue:`13320`) +- ``compact_ints`` and ``use_unsigned`` have been deprecated in ``pd.read_csv()`` and will be removed in a future version (:issue:`13320`) +- ``buffer_lines`` has been deprecated in ``pd.read_csv()`` and will be removed in a future version (:issue:`13360`) .. _whatsnew_0182.performance: diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 150e5ba5e1521..a851a5f48f5e6 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -231,6 +231,12 @@ Note that the entire file is read into a single DataFrame regardless, use the `chunksize` or `iterator` parameter to return the data in chunks. (Only valid with C parser) +buffer_lines : int, default None + DEPRECATED: this argument will be removed in a future version because its + value is not respected by the parser + + If low_memory is True, specify the number of rows to be read for each + chunk. (Only valid with C parser) compact_ints : boolean, default False DEPRECATED: this argument will be removed in a future version @@ -238,7 +244,6 @@ the parser will attempt to cast it as the smallest integer dtype possible, either signed or unsigned depending on the specification from the `use_unsigned` parameter. - use_unsigned : boolean, default False DEPRECATED: this argument will be removed in a future version @@ -452,6 +457,7 @@ def _read(filepath_or_buffer, kwds): 'float_precision', ]) _deprecated_args = set([ + 'buffer_lines', 'compact_ints', 'use_unsigned', ]) @@ -810,7 +816,8 @@ def _clean_options(self, options, engine): _validate_header_arg(options['header']) for arg in _deprecated_args: - if result[arg] != _c_parser_defaults[arg]: + parser_default = _c_parser_defaults[arg] + if result.get(arg, parser_default) != parser_default: warnings.warn("The '{arg}' argument has been deprecated " "and will be removed in a future version" .format(arg=arg), FutureWarning, stacklevel=2) diff --git a/pandas/io/tests/parser/test_parsers.py b/pandas/io/tests/parser/test_parsers.py index ea8ce9b616f36..fda7b28769647 100644 --- a/pandas/io/tests/parser/test_parsers.py +++ b/pandas/io/tests/parser/test_parsers.py @@ -72,14 +72,12 @@ def read_csv(self, *args, **kwds): kwds = kwds.copy() kwds['engine'] = self.engine kwds['low_memory'] = self.low_memory - kwds['buffer_lines'] = 2 return read_csv(*args, **kwds) def read_table(self, *args, **kwds): kwds = kwds.copy() kwds['engine'] = self.engine kwds['low_memory'] = True - kwds['buffer_lines'] = 2 return read_table(*args, **kwds) diff --git a/pandas/io/tests/parser/test_unsupported.py b/pandas/io/tests/parser/test_unsupported.py index e820924d2be8b..97862ffa90cef 100644 --- a/pandas/io/tests/parser/test_unsupported.py +++ b/pandas/io/tests/parser/test_unsupported.py @@ -124,6 +124,7 @@ def test_deprecated_args(self): # deprecated arguments with non-default values deprecated = { + 'buffer_lines': True, 'compact_ints': True, 'use_unsigned': True, } @@ -132,6 +133,10 @@ def test_deprecated_args(self): for engine in engines: for arg, non_default_val in deprecated.items(): + if engine == 'python' 
and arg == 'buffer_lines':
+                    # unsupported --> exception is raised first
+                    continue
+
                 with tm.assert_produces_warning(
                         FutureWarning, check_stacklevel=False):
                     kwargs = {arg: non_default_val}

From 5a9b498e43a41744470732438e9422a407b0b380 Mon Sep 17 00:00:00 2001
From: Christian Hudon
Date: Sun, 5 Jun 2016 10:04:11 -0400
Subject: [PATCH 90/96] BUG: Make pd.read_hdf('data.h5') work when the stored
 pandas object contains categorical columns

closes #13231

Author: Christian Hudon

Closes #13359 from chrish42/gh13231 and squashes the following commits:

e839638 [Christian Hudon] Raise a better exception when the HDF file is empty and key=None.
611aa28 [Christian Hudon] Formatting fixes.
e7c8313 [Christian Hudon] Add changelog entry.
df10016 [Christian Hudon] Make logic that detects if there is only one dataset in an HDF5 file work when storing a dataframe that contains categorical data.
2f41aef [Christian Hudon] Tweak comment to be clearer.
b3a5773 [Christian Hudon] Add test that fails for GitHub bug #13231
02f90d5 [Christian Hudon] Use if-expression.
---
 doc/source/whatsnew/v0.18.2.txt  |  1 +
 pandas/io/pytables.py            | 33 +++++++++++++++++++++++++++-----
 pandas/io/tests/test_pytables.py | 25 ++++++++++++++++++++++--
 3 files changed, 52 insertions(+), 7 deletions(-)

diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt
index 2f841fa6b6e18..93aedce07da9d 100644
--- a/doc/source/whatsnew/v0.18.2.txt
+++ b/doc/source/whatsnew/v0.18.2.txt
@@ -342,6 +342,7 @@ Bug Fixes
 - Bug in ``.resample(..)`` with a ``PeriodIndex`` not retaining its type or name appropriately when empty (:issue:`13212`)
 - Bug in ``groupby(..).resample(..)`` where passing some keywords would raise an exception (:issue:`13235`)
 - Bug in ``.tz_convert`` on a tz-aware ``DateTimeIndex`` that relied on index being sorted for correct results (:issue:`13306`)
+- Bug in ``pd.read_hdf()`` where attempting to load an HDF file with a single dataset that had one or more categorical columns failed unless the key argument was set to the name of the dataset (:issue:`13231`)

diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
index fcf5125d956c6..cbe04349b5105 100644
--- a/pandas/io/pytables.py
+++ b/pandas/io/pytables.py
@@ -331,11 +331,20 @@ def read_hdf(path_or_buf, key=None, **kwargs):
     try:
         if key is None:
-            keys = store.keys()
-            if len(keys) != 1:
-                raise ValueError('key must be provided when HDF file contains '
-                                 'multiple datasets.')
-            key = keys[0]
+            groups = store.groups()
+            if len(groups) == 0:
+                raise ValueError('No dataset in HDF5 file.')
+            candidate_only_group = groups[0]
+
+            # For the HDF file to have only one dataset, all other groups
+            # should then be metadata groups for that candidate group. (This
+            # assumes that the groups() method enumerates parent groups
+            # before their children.)
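+            # (A hypothetical illustration: a table-format frame with a
+            # categorical column stored under "/df" could yield groups()
+            # of "/df" plus nested metadata groups such as
+            # "/df/meta/.../meta", all of which belong to the "/df" dataset.)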
+ for group_to_check in groups[1:]: + if not _is_metadata_of(group_to_check, candidate_only_group): + raise ValueError('key must be provided when HDF5 file ' + 'contains multiple datasets.') + key = candidate_only_group._v_pathname return store.select(key, auto_close=auto_close, **kwargs) except: # if there is an error, close the store @@ -347,6 +356,20 @@ def read_hdf(path_or_buf, key=None, **kwargs): raise +def _is_metadata_of(group, parent_group): + """Check if a given group is a metadata group for a given parent_group.""" + if group._v_depth <= parent_group._v_depth: + return False + + current = group + while current._v_depth > 1: + parent = current._v_parent + if parent == parent_group and current._v_name == 'meta': + return True + current = current._v_parent + return False + + class HDFStore(StringMixin): """ diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 96b66265ea586..9c13162bd774c 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -46,8 +46,8 @@ from distutils.version import LooseVersion -_default_compressor = LooseVersion(tables.__version__) >= '2.2' \ - and 'blosc' or 'zlib' +_default_compressor = ('blosc' if LooseVersion(tables.__version__) >= '2.2' + else 'zlib') _multiprocess_can_split_ = False @@ -4877,6 +4877,9 @@ def test_read_nokey(self): df = DataFrame(np.random.rand(4, 5), index=list('abcd'), columns=list('ABCDE')) + + # Categorical dtype not supported for "fixed" format. So no need + # to test with that dtype in the dataframe here. with ensure_clean_path(self.path) as path: df.to_hdf(path, 'df', mode='a') reread = read_hdf(path) @@ -4884,6 +4887,24 @@ def test_read_nokey(self): df.to_hdf(path, 'df2', mode='a') self.assertRaises(ValueError, read_hdf, path) + def test_read_nokey_table(self): + # GH13231 + df = DataFrame({'i': range(5), + 'c': Series(list('abacd'), dtype='category')}) + + with ensure_clean_path(self.path) as path: + df.to_hdf(path, 'df', mode='a', format='table') + reread = read_hdf(path) + assert_frame_equal(df, reread) + df.to_hdf(path, 'df2', mode='a', format='table') + self.assertRaises(ValueError, read_hdf, path) + + def test_read_nokey_empty(self): + with ensure_clean_path(self.path) as path: + store = HDFStore(path) + store.close() + self.assertRaises(ValueError, read_hdf, path) + def test_read_from_pathlib_path(self): # GH11773 From e90d411714e7deac73e3e6b763ba9dccd3549871 Mon Sep 17 00:00:00 2001 From: Stewart Henderson Date: Sun, 5 Jun 2016 13:06:10 -0500 Subject: [PATCH 91/96] DOC: remove obsolete cron job script (#13369) * Typo correction * removed deprecated script --- ci/cron/go_doc.sh | 99 ----------------------------------------------- 1 file changed, 99 deletions(-) delete mode 100755 ci/cron/go_doc.sh diff --git a/ci/cron/go_doc.sh b/ci/cron/go_doc.sh deleted file mode 100755 index 89659577d0e7f..0000000000000 --- a/ci/cron/go_doc.sh +++ /dev/null @@ -1,99 +0,0 @@ -#!/bin/bash - -# This is a one-command cron job for setting up -# a virtualenv-based, linux-based, py2-based environment -# for building the Pandas documentation. -# -# The first run will install all required deps from pypi -# into the venv including monsters like scipy. -# You may want to set it up yourself to speed up the -# process. -# -# This is meant to be run as a cron job under a dedicated -# user account whose HOME directory contains this script. -# a CI directory will be created under it and all files -# stored within it. 
-# -# The hardcoded dep versions will gradually become obsolete -# You may need to tweak them -# -# @y-p, Jan/2014 - -# disto latex is sometimes finicky. Optionall use -# a local texlive install -export PATH=/mnt/debian/texlive/2013/bin/x86_64-linux:$PATH - -# Having ccache will speed things up -export PATH=/usr/lib64/ccache/:$PATH - -# limit disk usage -ccache -M 200M - -BASEDIR="$HOME/CI" -REPO_URL="https://github.com/pydata/pandas" -REPO_LOC="$BASEDIR/pandas" - -if [ ! -d $BASEDIR ]; then - mkdir -p $BASEDIR - virtualenv $BASEDIR/venv -fi - -source $BASEDIR/venv/bin/activate - -pip install numpy==1.7.2 -pip install cython==0.20.0 -pip install python-dateutil==2.2 -pip install --pre pytz==2013.9 -pip install sphinx==1.1.3 -pip install numexpr==2.2.2 - -pip install matplotlib==1.3.0 -pip install lxml==3.2.5 -pip install beautifulsoup4==4.3.2 -pip install html5lib==0.99 - -# You'll need R as well -pip install rpy2==2.3.9 - -pip install tables==3.0.0 -pip install bottleneck==0.7.0 -pip install ipython==0.13.2 - -# only if you have too -pip install scipy==0.13.2 - -pip install openpyxl==1.6.2 -pip install xlrd==0.9.2 -pip install xlwt==0.7.5 -pip install xlsxwriter==0.5.1 -pip install sqlalchemy==0.8.3 - -if [ ! -d "$REPO_LOC" ]; then - git clone "$REPO_URL" "$REPO_LOC" -fi - -cd "$REPO_LOC" -git reset --hard -git clean -df -git checkout master -git pull origin -make - -source $BASEDIR/venv/bin/activate -export PATH="/usr/lib64/ccache/:$PATH" -pip uninstall pandas -yq -pip install "$REPO_LOC" - -cd "$REPO_LOC"/doc - -python make.py clean -python make.py html -if [ ! $? == 0 ]; then - exit 1 -fi -python make.py zip_html -# usually requires manual intervention -# python make.py latex - -# If you have access: -# python make.py upload_dev From b722222f5ea760a3f3df4d063309949eb4956674 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 5 Jun 2016 17:56:40 -0400 Subject: [PATCH 92/96] CLN: remove old skiplist code Author: Jeff Reback Closes #13372 from jreback/skiplist and squashes the following commits: e05ea24 [Jeff Reback] CLN: remove old skiplist code --- pandas/algos.pyx | 44 -------------------------------------------- 1 file changed, 44 deletions(-) diff --git a/pandas/algos.pyx b/pandas/algos.pyx index a31b35ba4afc6..7884d9c41845c 100644 --- a/pandas/algos.pyx +++ b/pandas/algos.pyx @@ -1505,52 +1505,8 @@ def roll_kurt(ndarray[double_t] input, #------------------------------------------------------------------------------- # Rolling median, min, max -ctypedef double_t (* skiplist_f)(object sl, int n, int p) - -cdef _roll_skiplist_op(ndarray arg, int win, int minp, skiplist_f op): - cdef ndarray[double_t] input = arg - cdef double val, prev, midpoint - cdef IndexableSkiplist skiplist - cdef Py_ssize_t nobs = 0, i - - cdef Py_ssize_t N = len(input) - cdef ndarray[double_t] output = np.empty(N, dtype=float) - - skiplist = IndexableSkiplist(win) - - minp = _check_minp(win, minp, N) - - for i from 0 <= i < minp - 1: - val = input[i] - - # Not NaN - if val == val: - nobs += 1 - skiplist.insert(val) - - output[i] = NaN - - for i from minp - 1 <= i < N: - val = input[i] - - if i > win - 1: - prev = input[i - win] - - if prev == prev: - skiplist.remove(prev) - nobs -= 1 - - if val == val: - nobs += 1 - skiplist.insert(val) - - output[i] = op(skiplist, nobs, minp) - - return output - from skiplist cimport * - @cython.boundscheck(False) @cython.wraparound(False) def roll_median_c(ndarray[float64_t] arg, int win, int minp): From 3600bcaaf824b312e65fc2c5e96a026dfe28bbba Mon Sep 17 00:00:00 
2001 From: Stephen Kappel Date: Sun, 5 Jun 2016 19:25:02 -0400 Subject: [PATCH 93/96] ENH: incorporate PR feedback; GH7271 --- doc/source/whatsnew/v0.18.2.txt | 1 - pandas/core/generic.py | 47 ++++++++++-------------------- pandas/tests/frame/test_dtypes.py | 42 ++++++-------------------- pandas/tests/series/test_dtypes.py | 14 --------- 4 files changed, 25 insertions(+), 79 deletions(-) diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index d3d0ed4ba86d1..8ce877fd75019 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -30,7 +30,6 @@ Other enhancements ^^^^^^^^^^^^^^^^^^ - The ``.tz_localize()`` method of ``DatetimeIndex`` and ``Timestamp`` has gained the ``errors`` keyword, so you can potentially coerce nonexistent timestamps to ``NaT``. The default behaviour remains to raising a ``NonExistentTimeError`` (:issue:`13057`) -- The `copy` argument to the ``astype()`` functions has been deprecated in favor of a new ``inplace`` argument. (:issue:`12086`) - ``astype()`` will now accept a dict of column name to data types mapping as the ``dtype`` argument. (:issue:`12086`) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index ca090634e524f..00cae9602f8e3 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2931,44 +2931,33 @@ def blocks(self): """Internal property, property synonym for as_blocks()""" return self.as_blocks() - def astype(self, dtype, copy=True, inplace=False, raise_on_error=True, - **kwargs): + def astype(self, dtype, copy=True, raise_on_error=True, **kwargs): """ Cast object to input numpy.dtype + Return a copy when copy = True (be really careful with this!) Parameters ---------- - dtype : numpy.dtype or Python type (to cast entire DataFrame to the - same type). Alternatively, {col: dtype, ...}, where col is a column - label and dtype is a numpy.dtype or Python type (to cast one or - more of the DataFrame's columns to column-specific types). - copy : deprecated; use inplace instead - inplace : boolean, default False - Modify the NDFrame in place (do not create a new object) + dtype : numpy.dtype, Python type, or dict + Use a numpy.dtype or Python type to cast entire pandas object to the + same type. Alternatively, use {col: dtype, ...}, where col is a + column label and dtype is a numpy.dtype or Python type to cast one + or more of the DataFrame's columns to column-specific types. raise_on_error : raise on invalid input - kwargs : keyword arguments to pass on to the constructor if - inplace=False + kwargs : keyword arguments to pass on to the constructor Returns ------- - casted : type of caller (if inplace=False) or None (if inplace=True) + casted : type of caller """ if isinstance(dtype, collections.Mapping): if self.ndim == 1: # i.e. 
Series if len(dtype) > 1 or list(dtype.keys())[0] != self.name: - if raise_on_error: - raise KeyError('Only the Series name can be used for ' - 'the key in Series dtype mappings.') - return - for key, value in dtype.items(): - return self.astype(value, copy, inplace, raise_on_error, - **kwargs) - - if inplace: - for col, typ in dtype.items(): - self[col].astype(typ, inplace=True, - raise_on_error=raise_on_error) - return + raise KeyError('Only the Series name can be used for ' + 'the key in Series dtype mappings.') + typ = list(dtype.values())[0] + return self.astype(typ, copy, raise_on_error, **kwargs) + from pandas.tools.merge import concat casted_cols = [self[col].astype(typ, copy=copy) for col, typ in dtype.items()] @@ -2979,13 +2968,9 @@ def astype(self, dtype, copy=True, inplace=False, raise_on_error=True, return new_df.reindex(columns=self.columns, copy=False) # else, only a single dtype is given - new_data = self._data.astype(dtype=dtype, copy=not inplace, + new_data = self._data.astype(dtype=dtype, copy=copy, raise_on_error=raise_on_error, **kwargs) - if inplace: - self._update_inplace(new_data) - return - else: - return self._constructor(new_data).__finalize__(self) + return self._constructor(new_data).__finalize__(self) def copy(self, deep=True): """ diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py index 6c581298531fa..fef1b95550e8a 100644 --- a/pandas/tests/frame/test_dtypes.py +++ b/pandas/tests/frame/test_dtypes.py @@ -391,6 +391,15 @@ def test_astype_dict(self): assert_frame_equal(result, expected) assert_frame_equal(df, original) + result = df.astype({'b': np.float32, 'c': 'float32', 'd': np.float64}) + expected = DataFrame({ + 'a': a, + 'b': Series([0.0, 1.0, 2.0, 3.0, 4.0], dtype='float32'), + 'c': Series([0.0, 0.2, 0.4, 0.6, 0.8], dtype='float32'), + 'd': Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype='float64')}) + assert_frame_equal(result, expected) + assert_frame_equal(df, original) + # change all columns assert_frame_equal(df.astype({'a': str, 'b': str, 'c': str, 'd': str}), df.astype(str)) @@ -408,39 +417,6 @@ def test_astype_dict(self): assert_frame_equal(df, equiv) assert_frame_equal(df, original) - # using inplace=True, the df should be changed - output = df.astype({'b': 'str', 'd': 'float32'}, inplace=True) - expected = DataFrame({ - 'a': a, - 'b': Series(['0', '1', '2', '3', '4']), - 'c': c, - 'd': Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype='float32')}) - self.assertEqual(output, None) - assert_frame_equal(df, expected) - - df.astype({'b': np.float32, 'c': 'float32', 'd': np.float64}, - inplace=True) - expected = DataFrame({ - 'a': a, - 'b': Series([0.0, 1.0, 2.0, 3.0, 4.0], dtype='float32'), - 'c': Series([0.0, 0.2, 0.4, 0.6, 0.8], dtype='float32'), - 'd': Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype='float64')}) - assert_frame_equal(df, expected) - - def test_astype_inplace(self): - # GH7271 - df = DataFrame({'a': range(10), - 'b': range(2, 12), - 'c': np.arange(4.0, 14.0, dtype='float64')}) - df.astype('float', inplace=True) - for col in df.columns: - self.assertTrue(df[col].map(lambda x: type(x) == float).all()) - self.assertEqual(df[col].dtype, 'float64') - df.astype('str', inplace=True) - for col in df.columns: - self.assertTrue(df[col].map(lambda x: type(x) == str).all()) - self.assertEqual(df[col].dtype, 'object') - def test_timedeltas(self): df = DataFrame(dict(A=Series(date_range('2012-1-1', periods=3, freq='D')), diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index 
4d6f32af26e46..d6cfe1656fbc9 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -148,20 +148,6 @@ def test_astype_dict(self): self.assertRaises(KeyError, s.astype, {'abc': str, 'def': str}) self.assertRaises(KeyError, s.astype, {0: str}) - def test_astype_inplace(self): - s = Series(np.random.randn(5), name='foo') - dtypes = ['float32', 'float64', 'int64', 'int32'] - - for dtype in dtypes: - result = s.astype(dtype, inplace=False) - self.assertEqual(result.dtype, dtype) - self.assertEqual(result.name, s.name) - - for dtype in dtypes: - s.astype(dtype, inplace=True) - self.assertEqual(s.dtype, dtype) - self.assertEqual(s.name, 'foo') - def test_complexx(self): # GH4819 # complex access for ndarray compat From 29ecec0b9cd34ffb3aa6b1ba93f042a8e2520a96 Mon Sep 17 00:00:00 2001 From: Stephen Kappel Date: Sun, 8 May 2016 19:19:30 -0400 Subject: [PATCH 94/96] ENH: inplace dtype changes, df per-column dtype changes; GH7271 --- doc/source/whatsnew/v0.18.2.txt | 4 +- pandas/core/frame.py | 41 +++++++++++++++++ pandas/core/generic.py | 16 +++++-- pandas/tests/frame/test_dtypes.py | 70 ++++++++++++++++++++++++++++++ pandas/tests/series/test_dtypes.py | 13 ++++++ 5 files changed, 137 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index 93aedce07da9d..bb3731a9cc472 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -57,8 +57,7 @@ Other enhancements ^^^^^^^^^^^^^^^^^^ - The ``.tz_localize()`` method of ``DatetimeIndex`` and ``Timestamp`` has gained the ``errors`` keyword, so you can potentially coerce nonexistent timestamps to ``NaT``. The default behaviour remains to raising a ``NonExistentTimeError`` (:issue:`13057`) - -- ``Index`` now supports ``.str.extractall()`` which returns ``DataFrame``, see :ref:`Extract all matches in each subject (extractall) ` (:issue:`10008`, :issue:`13156`) +- ``astype()`` will now accept a dict of column name to data types mapping as the ``dtype`` argument. (:issue:`12086`) - ``.to_hdf/read_hdf()`` now accept path objects (e.g. ``pathlib.Path``, ``py.path.local``) for the file path (:issue:`11773`) .. ipython:: python @@ -79,7 +78,6 @@ Other enhancements - ``Index.astype()`` now accepts an optional boolean argument ``copy``, which allows optional copying if the requirements on dtype are satisfied (:issue:`13209`) - ``Index`` now supports the ``.where()`` function for same shape indexing (:issue:`13170`) - .. ipython:: python idx = pd.Index(['a', 'b', 'c']) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 69def7502a6f7..6e0f0bdbfed9e 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3772,6 +3772,47 @@ def update(self, other, join='left', overwrite=True, filter_func=None, # ---------------------------------------------------------------------- # Misc methods + def astype(self, dtype, copy=True, inplace=False, raise_on_error=True, + **kwargs): + """ + Cast object to given data type(s). + + Parameters + ---------- + dtype : numpy.dtype or Python type (to cast entire DataFrame to the + same type). Alternatively, {col: dtype, ...}, where col is a column + label and dtype is a numpy.dtype or Python type (to cast one or + more of the DataFrame's columns to column-specific types). 
+ copy : deprecated; use inplace instead + inplace : boolean, default False + Modify the DataFrame in place (do not create a new object) + raise_on_error : raise on invalid input + kwargs : keyword arguments to pass on to the constructor if + inplace=False + + Returns + ------- + casted : type of caller + """ + if isinstance(dtype, collections.Mapping): + if inplace: + for col, typ in dtype.items(): + self[col].astype(typ, inplace=True, + raise_on_error=raise_on_error) + return None + else: + from pandas.tools.merge import concat + casted_cols = [self[col].astype(typ, copy=copy) + for col, typ in dtype.items()] + other_col_labels = self.columns.difference(dtype.keys()) + other_cols = [self[col].copy() if copy else self[col] + for col in other_col_labels] + new_df = concat(casted_cols + other_cols, axis=1) + return new_df.reindex(columns=self.columns, copy=False) + df = super(DataFrame, self) + return df.astype(dtype=dtype, copy=copy, inplace=inplace, + raise_on_error=raise_on_error, **kwargs) + def first_valid_index(self): """ Return label for first non-NA/null value diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 0852c5a293f4e..53d2d6d2fa22a 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -144,7 +144,7 @@ def _init_mgr(self, mgr, axes=None, dtype=None, copy=False): @property def _constructor(self): - """Used when a manipulation result has the same dimesions as the + """Used when a manipulation result has the same dimensions as the original. """ raise AbstractMethodError(self) @@ -2973,14 +2973,17 @@ def blocks(self): """Internal property, property synonym for as_blocks()""" return self.as_blocks() - def astype(self, dtype, copy=True, raise_on_error=True, **kwargs): + def astype(self, dtype, copy=True, inplace=False, raise_on_error=True, + **kwargs): """ Cast object to input numpy.dtype - Return a copy when copy = True (be really careful with this!) 
Parameters ---------- dtype : numpy.dtype or Python type + copy : deprecated; use inplace instead + inplace : boolean, default False + Modify the NDFrame in place (do not create a new object) raise_on_error : raise on invalid input kwargs : keyword arguments to pass on to the constructor @@ -2988,7 +2991,12 @@ def astype(self, dtype, copy=True, raise_on_error=True, **kwargs): ------- casted : type of caller """ - + if inplace: + new_data = self._data.astype(dtype=dtype, copy=False, + raise_on_error=raise_on_error, + **kwargs) + self._update_inplace(new_data) + return mgr = self._data.astype(dtype=dtype, copy=copy, raise_on_error=raise_on_error, **kwargs) return self._constructor(mgr).__finalize__(self) diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py index 5f95ff6b6b601..7ce540604b533 100644 --- a/pandas/tests/frame/test_dtypes.py +++ b/pandas/tests/frame/test_dtypes.py @@ -398,6 +398,76 @@ def test_astype_str(self): expected = DataFrame(['1.12345678901']) assert_frame_equal(result, expected) + def test_astype_dict(self): + # GH7271 + a = Series(date_range('2010-01-04', periods=5)) + b = Series(range(5)) + c = Series([0.0, 0.2, 0.4, 0.6, 0.8]) + d = Series(['1.0', '2', '3.14', '4', '5.4']) + df = DataFrame({'a': a, 'b': b, 'c': c, 'd': d}) + original = df.copy(deep=True) + + # change type of a subset of columns + expected = DataFrame({ + 'a': a, + 'b': Series(['0', '1', '2', '3', '4']), + 'c': c, + 'd': Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype='float32')}) + astyped = df.astype({'b': 'str', 'd': 'float32'}) + assert_frame_equal(astyped, expected) + assert_frame_equal(df, original) + self.assertEqual(astyped.b.dtype, 'object') + self.assertEqual(astyped.d.dtype, 'float32') + + # change all columns + assert_frame_equal(df.astype({'a': str, 'b': str, 'c': str, 'd': str}), + df.astype(str)) + assert_frame_equal(df, original) + + # error should be raised when using something other than column labels + # in the keys of the dtype dict + self.assertRaises(KeyError, df.astype, {'b': str, 2: str}) + self.assertRaises(KeyError, df.astype, {'e': str}) + assert_frame_equal(df, original) + + # if the dtypes provided are the same as the original dtypes, the + # resulting DataFrame should be the same as the original DataFrame + equiv = df.astype({col: df[col].dtype for col in df.columns}) + assert_frame_equal(df, equiv) + assert_frame_equal(df, original) + + # using inplace=True, the df should be changed + output = df.astype({'b': 'str', 'd': 'float32'}, inplace=True) + self.assertEqual(output, None) + assert_frame_equal(df, expected) + df.astype({'b': np.float32, 'c': 'float32', 'd': np.float32}, + inplace=True) + self.assertEqual(df.a.dtype, original.a.dtype) + self.assertEqual(df.b.dtype, 'float32') + self.assertEqual(df.c.dtype, 'float32') + self.assertEqual(df.d.dtype, 'float32') + self.assertEqual(df.b[0], 0.0) + df.astype({'b': str, 'c': 'float64', 'd': np.float64}, inplace=True) + self.assertEqual(df.a.dtype, original.a.dtype) + self.assertEqual(df.b.dtype, 'object') + self.assertEqual(df.c.dtype, 'float64') + self.assertEqual(df.d.dtype, 'float64') + self.assertEqual(df.b[0], '0.0') + + def test_astype_inplace(self): + # GH7271 + df = DataFrame({'a': range(10), + 'b': range(2, 12), + 'c': np.arange(4.0, 14.0, dtype='float64')}) + df.astype('float', inplace=True) + for col in df.columns: + self.assertTrue(df[col].map(lambda x: type(x) == float).all()) + self.assertEqual(df[col].dtype, 'float64') + df.astype('str', inplace=True) + for col in df.columns: + 
self.assertTrue(df[col].map(lambda x: type(x) == str).all()) + self.assertEqual(df[col].dtype, 'object') + def test_timedeltas(self): df = DataFrame(dict(A=Series(date_range('2012-1-1', periods=3, freq='D')), diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index 6864eac603ded..273c8d0b440f5 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -133,6 +133,19 @@ def test_astype_unicode(self): reload(sys) # noqa sys.setdefaultencoding(former_encoding) + def test_astype_inplace(self): + s = Series(np.random.randn(5), name='foo') + + for dtype in ['float32', 'float64', 'int64', 'int32']: + astyped = s.astype(dtype, inplace=False) + self.assertEqual(astyped.dtype, dtype) + self.assertEqual(astyped.name, s.name) + + for dtype in ['float32', 'float64', 'int64', 'int32']: + s.astype(dtype, inplace=True) + self.assertEqual(s.dtype, dtype) + self.assertEqual(s.name, 'foo') + def test_complexx(self): # GH4819 # complex access for ndarray compat From 95a029bd577bffb27a39fb34ee54702da8a9aad3 Mon Sep 17 00:00:00 2001 From: Stephen Kappel Date: Mon, 9 May 2016 22:25:14 -0400 Subject: [PATCH 95/96] ENH: NDFrame astype() now accepts inplace arg and dtype arg can be a mapping of col to type; GH7271 --- pandas/core/frame.py | 41 ------------------------- pandas/core/generic.py | 49 ++++++++++++++++++++++++------ pandas/tests/frame/test_dtypes.py | 31 +++++++++---------- pandas/tests/series/test_dtypes.py | 26 +++++++++++++--- 4 files changed, 75 insertions(+), 72 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 6e0f0bdbfed9e..69def7502a6f7 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3772,47 +3772,6 @@ def update(self, other, join='left', overwrite=True, filter_func=None, # ---------------------------------------------------------------------- # Misc methods - def astype(self, dtype, copy=True, inplace=False, raise_on_error=True, - **kwargs): - """ - Cast object to given data type(s). - - Parameters - ---------- - dtype : numpy.dtype or Python type (to cast entire DataFrame to the - same type). Alternatively, {col: dtype, ...}, where col is a column - label and dtype is a numpy.dtype or Python type (to cast one or - more of the DataFrame's columns to column-specific types). 
- copy : deprecated; use inplace instead - inplace : boolean, default False - Modify the DataFrame in place (do not create a new object) - raise_on_error : raise on invalid input - kwargs : keyword arguments to pass on to the constructor if - inplace=False - - Returns - ------- - casted : type of caller - """ - if isinstance(dtype, collections.Mapping): - if inplace: - for col, typ in dtype.items(): - self[col].astype(typ, inplace=True, - raise_on_error=raise_on_error) - return None - else: - from pandas.tools.merge import concat - casted_cols = [self[col].astype(typ, copy=copy) - for col, typ in dtype.items()] - other_col_labels = self.columns.difference(dtype.keys()) - other_cols = [self[col].copy() if copy else self[col] - for col in other_col_labels] - new_df = concat(casted_cols + other_cols, axis=1) - return new_df.reindex(columns=self.columns, copy=False) - df = super(DataFrame, self) - return df.astype(dtype=dtype, copy=copy, inplace=inplace, - raise_on_error=raise_on_error, **kwargs) - def first_valid_index(self): """ Return label for first non-NA/null value diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 53d2d6d2fa22a..46d289644cdca 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1,4 +1,5 @@ # pylint: disable=W0231,E1101 +import collections import warnings import operator import weakref @@ -2980,26 +2981,54 @@ def astype(self, dtype, copy=True, inplace=False, raise_on_error=True, Parameters ---------- - dtype : numpy.dtype or Python type + dtype : numpy.dtype or Python type (to cast entire DataFrame to the + same type). Alternatively, {col: dtype, ...}, where col is a column + label and dtype is a numpy.dtype or Python type (to cast one or + more of the DataFrame's columns to column-specific types). copy : deprecated; use inplace instead inplace : boolean, default False Modify the NDFrame in place (do not create a new object) raise_on_error : raise on invalid input - kwargs : keyword arguments to pass on to the constructor + kwargs : keyword arguments to pass on to the constructor if + inplace=False Returns ------- - casted : type of caller - """ + casted : type of caller (if inplace=False) or None (if inplace=True) + """ + if isinstance(dtype, collections.Mapping): + if self.ndim == 1: # i.e. 
Series + if len(dtype) > 1 or list(dtype.keys())[0] != self.name: + if raise_on_error: + raise KeyError('Only the Series name can be used for ' + 'the key in Series dtype mappings.') + return + for key, value in dtype.items(): + return self.astype(value, copy, inplace, raise_on_error, + **kwargs) + + if inplace: + for col, typ in dtype.items(): + self[col].astype(typ, inplace=True, + raise_on_error=raise_on_error) + return + from pandas.tools.merge import concat + casted_cols = [self[col].astype(typ, copy=copy) + for col, typ in dtype.items()] + other_col_labels = self.columns.difference(dtype.keys()) + other_cols = [self[col].copy() if copy else self[col] + for col in other_col_labels] + new_df = concat(casted_cols + other_cols, axis=1) + return new_df.reindex(columns=self.columns, copy=False) + + # else, only a single dtype is given + new_data = self._data.astype(dtype=dtype, copy=not inplace, + raise_on_error=raise_on_error, **kwargs) if inplace: - new_data = self._data.astype(dtype=dtype, copy=False, - raise_on_error=raise_on_error, - **kwargs) self._update_inplace(new_data) return - mgr = self._data.astype(dtype=dtype, copy=copy, - raise_on_error=raise_on_error, **kwargs) - return self._constructor(mgr).__finalize__(self) + else: + return self._constructor(new_data).__finalize__(self) def copy(self, deep=True): """ diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py index 7ce540604b533..89ba8d8c98c5c 100644 --- a/pandas/tests/frame/test_dtypes.py +++ b/pandas/tests/frame/test_dtypes.py @@ -408,16 +408,14 @@ def test_astype_dict(self): original = df.copy(deep=True) # change type of a subset of columns + result = df.astype({'b': 'str', 'd': 'float32'}) expected = DataFrame({ 'a': a, 'b': Series(['0', '1', '2', '3', '4']), 'c': c, 'd': Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype='float32')}) - astyped = df.astype({'b': 'str', 'd': 'float32'}) - assert_frame_equal(astyped, expected) + assert_frame_equal(result, expected) assert_frame_equal(df, original) - self.assertEqual(astyped.b.dtype, 'object') - self.assertEqual(astyped.d.dtype, 'float32') # change all columns assert_frame_equal(df.astype({'a': str, 'b': str, 'c': str, 'd': str}), @@ -438,21 +436,22 @@ def test_astype_dict(self): # using inplace=True, the df should be changed output = df.astype({'b': 'str', 'd': 'float32'}, inplace=True) + expected = DataFrame({ + 'a': a, + 'b': Series(['0', '1', '2', '3', '4']), + 'c': c, + 'd': Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype='float32')}) self.assertEqual(output, None) assert_frame_equal(df, expected) - df.astype({'b': np.float32, 'c': 'float32', 'd': np.float32}, + + df.astype({'b': np.float32, 'c': 'float32', 'd': np.float64}, inplace=True) - self.assertEqual(df.a.dtype, original.a.dtype) - self.assertEqual(df.b.dtype, 'float32') - self.assertEqual(df.c.dtype, 'float32') - self.assertEqual(df.d.dtype, 'float32') - self.assertEqual(df.b[0], 0.0) - df.astype({'b': str, 'c': 'float64', 'd': np.float64}, inplace=True) - self.assertEqual(df.a.dtype, original.a.dtype) - self.assertEqual(df.b.dtype, 'object') - self.assertEqual(df.c.dtype, 'float64') - self.assertEqual(df.d.dtype, 'float64') - self.assertEqual(df.b[0], '0.0') + expected = DataFrame({ + 'a': a, + 'b': Series([0.0, 1.0, 2.0, 3.0, 4.0], dtype='float32'), + 'c': Series([0.0, 0.2, 0.4, 0.6, 0.8], dtype='float32'), + 'd': Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype='float64')}) + assert_frame_equal(df, expected) def test_astype_inplace(self): # GH7271 diff --git a/pandas/tests/series/test_dtypes.py 
b/pandas/tests/series/test_dtypes.py index 273c8d0b440f5..387dbfee2b048 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -133,15 +133,31 @@ def test_astype_unicode(self): reload(sys) # noqa sys.setdefaultencoding(former_encoding) + def test_astype_dict(self): + s = Series(range(0, 10, 2), name='abc') + + result = s.astype({'abc': str}) + expected = Series(['0', '2', '4', '6', '8'], name='abc') + assert_series_equal(result, expected) + + result = s.astype({'abc': 'float64'}) + expected = Series([0.0, 2.0, 4.0, 6.0, 8.0], dtype='float64', + name='abc') + assert_series_equal(result, expected) + + self.assertRaises(KeyError, s.astype, {'abc': str, 'def': str}) + self.assertRaises(KeyError, s.astype, {0: str}) + def test_astype_inplace(self): s = Series(np.random.randn(5), name='foo') + dtypes = ['float32', 'float64', 'int64', 'int32'] - for dtype in ['float32', 'float64', 'int64', 'int32']: - astyped = s.astype(dtype, inplace=False) - self.assertEqual(astyped.dtype, dtype) - self.assertEqual(astyped.name, s.name) + for dtype in dtypes: + result = s.astype(dtype, inplace=False) + self.assertEqual(result.dtype, dtype) + self.assertEqual(result.name, s.name) - for dtype in ['float32', 'float64', 'int64', 'int32']: + for dtype in dtypes: s.astype(dtype, inplace=True) self.assertEqual(s.dtype, dtype) self.assertEqual(s.name, 'foo') From 9d8e1b52048e98d3d12711b4cb5bd7b1e29ea572 Mon Sep 17 00:00:00 2001 From: Stephen Kappel Date: Sun, 5 Jun 2016 19:25:02 -0400 Subject: [PATCH 96/96] ENH: incorporate PR feedback; GH7271 --- pandas/core/generic.py | 47 ++++++++++-------------------- pandas/tests/frame/test_dtypes.py | 42 ++++++-------------------- pandas/tests/series/test_dtypes.py | 14 --------- 3 files changed, 25 insertions(+), 78 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 46d289644cdca..6f062a28b8dc7 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2974,44 +2974,33 @@ def blocks(self): """Internal property, property synonym for as_blocks()""" return self.as_blocks() - def astype(self, dtype, copy=True, inplace=False, raise_on_error=True, - **kwargs): + def astype(self, dtype, copy=True, raise_on_error=True, **kwargs): """ Cast object to input numpy.dtype + Return a copy when copy = True (be really careful with this!) Parameters ---------- - dtype : numpy.dtype or Python type (to cast entire DataFrame to the - same type). Alternatively, {col: dtype, ...}, where col is a column - label and dtype is a numpy.dtype or Python type (to cast one or - more of the DataFrame's columns to column-specific types). - copy : deprecated; use inplace instead - inplace : boolean, default False - Modify the NDFrame in place (do not create a new object) + dtype : numpy.dtype, Python type, or dict + Use a numpy.dtype or Python type to cast entire pandas object to the + same type. Alternatively, use {col: dtype, ...}, where col is a + column label and dtype is a numpy.dtype or Python type to cast one + or more of the DataFrame's columns to column-specific types. raise_on_error : raise on invalid input - kwargs : keyword arguments to pass on to the constructor if - inplace=False + kwargs : keyword arguments to pass on to the constructor Returns ------- - casted : type of caller (if inplace=False) or None (if inplace=True) + casted : type of caller """ if isinstance(dtype, collections.Mapping): if self.ndim == 1: # i.e. 
Series if len(dtype) > 1 or list(dtype.keys())[0] != self.name: - if raise_on_error: - raise KeyError('Only the Series name can be used for ' - 'the key in Series dtype mappings.') - return - for key, value in dtype.items(): - return self.astype(value, copy, inplace, raise_on_error, - **kwargs) - - if inplace: - for col, typ in dtype.items(): - self[col].astype(typ, inplace=True, - raise_on_error=raise_on_error) - return + raise KeyError('Only the Series name can be used for ' + 'the key in Series dtype mappings.') + typ = list(dtype.values())[0] + return self.astype(typ, copy, raise_on_error, **kwargs) + from pandas.tools.merge import concat casted_cols = [self[col].astype(typ, copy=copy) for col, typ in dtype.items()] @@ -3022,13 +3011,9 @@ def astype(self, dtype, copy=True, inplace=False, raise_on_error=True, return new_df.reindex(columns=self.columns, copy=False) # else, only a single dtype is given - new_data = self._data.astype(dtype=dtype, copy=not inplace, + new_data = self._data.astype(dtype=dtype, copy=copy, raise_on_error=raise_on_error, **kwargs) - if inplace: - self._update_inplace(new_data) - return - else: - return self._constructor(new_data).__finalize__(self) + return self._constructor(new_data).__finalize__(self) def copy(self, deep=True): """ diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py index 89ba8d8c98c5c..ab0cf04308bac 100644 --- a/pandas/tests/frame/test_dtypes.py +++ b/pandas/tests/frame/test_dtypes.py @@ -417,6 +417,15 @@ def test_astype_dict(self): assert_frame_equal(result, expected) assert_frame_equal(df, original) + result = df.astype({'b': np.float32, 'c': 'float32', 'd': np.float64}) + expected = DataFrame({ + 'a': a, + 'b': Series([0.0, 1.0, 2.0, 3.0, 4.0], dtype='float32'), + 'c': Series([0.0, 0.2, 0.4, 0.6, 0.8], dtype='float32'), + 'd': Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype='float64')}) + assert_frame_equal(result, expected) + assert_frame_equal(df, original) + # change all columns assert_frame_equal(df.astype({'a': str, 'b': str, 'c': str, 'd': str}), df.astype(str)) @@ -434,39 +443,6 @@ def test_astype_dict(self): assert_frame_equal(df, equiv) assert_frame_equal(df, original) - # using inplace=True, the df should be changed - output = df.astype({'b': 'str', 'd': 'float32'}, inplace=True) - expected = DataFrame({ - 'a': a, - 'b': Series(['0', '1', '2', '3', '4']), - 'c': c, - 'd': Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype='float32')}) - self.assertEqual(output, None) - assert_frame_equal(df, expected) - - df.astype({'b': np.float32, 'c': 'float32', 'd': np.float64}, - inplace=True) - expected = DataFrame({ - 'a': a, - 'b': Series([0.0, 1.0, 2.0, 3.0, 4.0], dtype='float32'), - 'c': Series([0.0, 0.2, 0.4, 0.6, 0.8], dtype='float32'), - 'd': Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype='float64')}) - assert_frame_equal(df, expected) - - def test_astype_inplace(self): - # GH7271 - df = DataFrame({'a': range(10), - 'b': range(2, 12), - 'c': np.arange(4.0, 14.0, dtype='float64')}) - df.astype('float', inplace=True) - for col in df.columns: - self.assertTrue(df[col].map(lambda x: type(x) == float).all()) - self.assertEqual(df[col].dtype, 'float64') - df.astype('str', inplace=True) - for col in df.columns: - self.assertTrue(df[col].map(lambda x: type(x) == str).all()) - self.assertEqual(df[col].dtype, 'object') - def test_timedeltas(self): df = DataFrame(dict(A=Series(date_range('2012-1-1', periods=3, freq='D')), diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index 
387dbfee2b048..5194a29bc8b42 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -148,20 +148,6 @@ def test_astype_dict(self): self.assertRaises(KeyError, s.astype, {'abc': str, 'def': str}) self.assertRaises(KeyError, s.astype, {0: str}) - def test_astype_inplace(self): - s = Series(np.random.randn(5), name='foo') - dtypes = ['float32', 'float64', 'int64', 'int32'] - - for dtype in dtypes: - result = s.astype(dtype, inplace=False) - self.assertEqual(result.dtype, dtype) - self.assertEqual(result.name, s.name) - - for dtype in dtypes: - s.astype(dtype, inplace=True) - self.assertEqual(s.dtype, dtype) - self.assertEqual(s.name, 'foo') - def test_complexx(self): # GH4819 # complex access for ndarray compat
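
A usage sketch of the ``astype`` enhancement as the series leaves it
(a dict of column label to dtype; the frame and column names here are
invented for illustration):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'ints': [1, 2, 3], 'strs': ['4', '5', '6']})

    # cast a subset of columns; unlisted columns keep their dtype
    out = df.astype({'ints': np.float32, 'strs': 'float64'})
    # out.dtypes -> ints: float32, strs: float64

    # a Series accepts a mapping only under its own name
    s = pd.Series([1, 2, 3], name='x')
    s.astype({'x': 'float64'})    # same as s.astype('float64')
    # s.astype({'y': 'float64'})  # would raise KeyError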